Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.cu
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#include "math_kernel.h"
37#include <device/cuda/check.h>
38#include <stdio.h>
39#include <stdlib.h>
40
41#ifdef HAVE_NVSHMEM
42#include <nvshmem.h>
43#include <nvshmemx.h>
44#endif
45
46extern "C" {
47
50
51#ifdef HAVE_NCCL
54#endif
55
59 void cuda_copy(void *a, void *b, int *n, cudaStream_t strm) {
60 CUDA_CHECK(cudaMemcpyAsync(a, b, (*n) * sizeof(real),
62 }
63
67 void cuda_masked_copy(void *a, void *b, void *mask,
68 int *n, int *m, cudaStream_t strm) {
69
70 const dim3 nthrds(1024, 1, 1);
71 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
72
74 ((real *) a, (real*) b,(int*) mask, *n, *m);
76
77 }
78
82 void cuda_masked_gather_copy(void *a, void *b, void *mask,
83 int *n, int *m, cudaStream_t strm) {
84
85 const dim3 nthrds(1024, 1, 1);
86 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
87
89 ((real *) a, (real*) b,(int*) mask, *n, *m);
91
92 }
93
97 void cuda_masked_gather_copy_aligned(void *a, void *b, void *mask,
98 int *n, int *m, cudaStream_t strm) {
99
100 const dim3 nthrds(1024, 1, 1);
101 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
102
104 ((real *) a, (real*) b,(int*) mask, *n, *m);
106
107 }
108
112 void cuda_masked_atomic_reduction(void *a, void *b, void *mask,
113 int *n, int *m, cudaStream_t strm) {
114
115 const dim3 nthrds(1024, 1, 1);
116 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
117
119 ((real *) a, (real *) b, (int *) mask, *n, *m);
121
122 }
126 void cuda_masked_scatter_copy(void *a, void *b, void *mask,
127 int *n, int *m, cudaStream_t strm) {
128
129 const dim3 nthrds(1024, 1, 1);
130 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
131
133 ((real *) a, (real*) b,(int*) mask, *n, *m);
135 }
136
137
141 void cuda_cfill_mask(void* a, real* c, int* size, int* mask, int* mask_size,
143
144 const dim3 nthrds(1024, 1, 1);
145 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
146
148 ((real*)a, *c, *size, mask, *mask_size);
150 }
151
155 void cuda_rzero(void *a, int *n, cudaStream_t strm) {
156 CUDA_CHECK(cudaMemsetAsync(a, 0, (*n) * sizeof(real), strm));
157 }
158
162 void cuda_cmult(void *a, real *c, int *n, cudaStream_t strm) {
163
164 const dim3 nthrds(1024, 1, 1);
165 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
166
167 cmult_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
169
170 }
171
175 void cuda_cmult2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
176
177 const dim3 nthrds(1024, 1, 1);
178 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
179
181 ((real *) a, (real *) b, *c, *n);
183
184 }
185
189 void cuda_cdiv(void *a, real *c, int *n, cudaStream_t strm) {
190
191 const dim3 nthrds(1024, 1, 1);
192 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
193
194 cdiv_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
196
197 }
198
202 void cuda_cdiv2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
203
204 const dim3 nthrds(1024, 1, 1);
205 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
206
208 ((real *) a, (real *) b, *c, *n);
210
211 }
212
216 void cuda_radd(void *a, real *c, int *n, cudaStream_t strm) {
217
218 const dim3 nthrds(1024, 1, 1);
219 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
220
221 cadd_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
223
224 }
225
230 void cuda_cadd2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
231
232 const dim3 nthrds(1024, 1, 1);
233 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
234
236 ((real *) a, (real *) b, *c, *n);
238
239 }
240
244 void cuda_cfill(void *a, real *c, int *n, cudaStream_t strm) {
245
246 const dim3 nthrds(1024, 1, 1);
247 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
248
249 if (*n > 0){
250 cfill_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
252 }
253
254 }
255
260 void cuda_add2(void *a, void *b, int *n, cudaStream_t strm) {
261
262 const dim3 nthrds(1024, 1, 1);
263 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
264
266 ((real *) a, (real *) b, *n);
268
269 }
270
275 void cuda_add3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
276
277 const dim3 nthrds(1024, 1, 1);
278 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
279
281 ((real *) a, (real *) b, (real *) c, *n);
283 }
284
289 void cuda_add4(void *a, void *b, void *c, void *d, int *n,
291
292 const dim3 nthrds(1024, 1, 1);
293 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
294
296 ((real *) a, (real *) b, (real *) c, (real *) d, *n);
298
299 }
305 void cuda_add2s1(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
306
307 const dim3 nthrds(1024, 1, 1);
308 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
309
311 ((real *) a, (real *) b, *c1, *n);
313
314 }
315
321 void cuda_add2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
322
323 const dim3 nthrds(1024, 1, 1);
324 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
325
327 ((real *) a, (real *) b, *c1, *n);
329
330 }
331
338 void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n,
340
341 const dim3 nthrds(1024, 1, 1);
342 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
343
345 ((real *) x, (const real **) p, (real *) alpha, *j, *n);
347
348 }
349
355 void cuda_addsqr2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
356
357 const dim3 nthrds(1024, 1, 1);
358 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
359
361 ((real *) a, (real *) b, *c1, *n);
363
364 }
365
371 void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n,
373
374 const dim3 nthrds(1024, 1, 1);
375 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
376
378 ((real *) a, (real *) b, (real *) c, *c1, *c2, *n);
380
381 }
382
388 void cuda_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2,
389 real *c3, int *n, cudaStream_t strm) {
390
391 const dim3 nthrds(1024, 1, 1);
392 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
393
395 ((real *) a, (real *) b, (real *) c, (real *) d, *c1, *c2, *c3, *n);
397
398 }
399
405 void cuda_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1,
406 real *c2, real *c3, real *c4, int *n, cudaStream_t strm) {
407
408 const dim3 nthrds(1024, 1, 1);
409 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
410
412 ((real *) a, (real *) b, (real *) c, (real *) d, (real *) e,
413 *c1, *c2, *c3, *c4, *n);
415
416 }
417
422 void cuda_invcol1(void *a, int *n, cudaStream_t strm) {
423
424 const dim3 nthrds(1024, 1, 1);
425 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
426
427 invcol1_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *n);
429 }
430
435 void cuda_invcol2(void *a, void *b, int *n, cudaStream_t strm) {
436
437 const dim3 nthrds(1024, 1, 1);
438 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
439
441 ((real *) a, (real *) b, *n);
443 }
444
449 void cuda_invcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
450
451 const dim3 nthrds(1024, 1, 1);
452 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
453
455 ((real *) a, (real *) b, (real *) c, *n);
457 }
458
463 void cuda_col2(void *a, void *b, int *n, cudaStream_t strm) {
464
465 const dim3 nthrds(1024, 1, 1);
466 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
467
468 col2_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, (real *) b, *n);
470 }
471
476 void cuda_col3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
477
478 const dim3 nthrds(1024, 1, 1);
479 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
480
482 ((real *) a, (real *) b, (real *) c, *n);
484 }
485
490 void cuda_subcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
491
492 const dim3 nthrds(1024, 1, 1);
493 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
494
496 ((real *) a, (real *) b, (real *) c, *n);
498 }
499
500
505 void cuda_sub2(void *a, void *b, int *n, cudaStream_t strm) {
506
507 const dim3 nthrds(1024, 1, 1);
508 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
509
511 ((real *) a, (real *) b, *n);
513 }
514
519 void cuda_sub3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
520
521 const dim3 nthrds(1024, 1, 1);
522 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
523
525 ((real *) a, (real *) b, (real *) c, *n);
527 }
528
533 void cuda_addcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
534
535 const dim3 nthrds(1024, 1, 1);
536 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
537
539 ((real *) a, (real *) b, (real *) c, *n);
541 }
542
547 void cuda_addcol4(void *a, void *b, void *c, void *d, int *n,
549
550 const dim3 nthrds(1024, 1, 1);
551 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
552
554 ((real *) a, (real *) b, (real *) c, (real *) d, *n);
556 }
557
562 void cuda_addcol3s2(void *a, void *b, void *c, real *s, int *n,
564
565 const dim3 nthrds(1024, 1, 1);
566 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
567
569 ((real *) a, (real *) b, (real *) c, *s, *n);
571 }
572
577 void cuda_vdot3(void *dot, void *u1, void *u2, void *u3,
578 void *v1, void *v2, void *v3, int *n,
580
581 const dim3 nthrds(1024, 1, 1);
582 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
583
585 ((real *) dot, (real *) u1, (real *) u2, (real *) u3,
586 (real *) v1, (real *) v2, (real *) v3, *n);
588 }
589
594 void cuda_vcross(void *u1, void *u2, void *u3,
595 void *v1, void *v2, void *v3,
596 void *w1, void *w2, void *w3,
597 int *n, cudaStream_t strm) {
598
599 const dim3 nthrds(1024, 1, 1);
600 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
601
603 ((real *) u1, (real *) u2, (real *) u3,
604 (real *) v1, (real *) v2, (real *) v3,
605 (real *) w1, (real *) w2, (real *) w3, *n);
607 }
608
609
610 /*
611 * Reduction buffer
612 */
613 int red_s = 0;
615 void * bufred_d = NULL;
616
621 if ( nb >= red_s) {
622 red_s = nb+1;
623 if (bufred != NULL) {
625#ifdef HAVE_NVSHMEM
627#else
629#endif
630 }
632#ifdef HAVE_NVSHMEM
633 bufred_d = (real *) nvshmem_malloc(red_s*sizeof(real));
634#else
636#endif
637 }
638 }
639
644 int n, const cudaStream_t stream) {
645
646#ifdef HAVE_NCCL
648 DEVICE_NCCL_SUM, stream);
650 cudaMemcpyDeviceToHost, stream));
651 cudaStreamSynchronize(stream);
652#elif HAVE_NVSHMEM
653 if (sizeof(real) == sizeof(float)) {
655 (float *) bufred_d,
656 (float *) bufred_d, n, stream);
657 }
658 else if (sizeof(real) == sizeof(double)) {
660 (double *) bufred_d,
661 (double *) bufred_d, n, stream);
662
663 }
665 sizeof(real)*n, cudaMemcpyDeviceToHost, stream));
666 cudaStreamSynchronize(stream);
667#elif HAVE_DEVICE_MPI
668 cudaStreamSynchronize(stream);
670#else
672 cudaMemcpyDeviceToHost, stream));
673 cudaStreamSynchronize(stream);
674#endif
675 }
676
677
678
683 real cuda_vlsc3(void *u, void *v, void *w, int *n, cudaStream_t stream) {
684
685 const dim3 nthrds(1024, 1, 1);
686 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
687 const int nb = ((*n) + 1024 - 1)/ 1024;
688
690
691 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
692 ((real *) u, (real *) v, (real *) w, (real *) bufred_d, *n);
694 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
696
698 cudaMemcpyDeviceToHost, stream));
699 cudaStreamSynchronize(stream);
700
701 return bufred[0];
702 }
703
704
705
706
711 real cuda_glsc3(void *a, void *b, void *c, int *n, cudaStream_t stream) {
712
713 const dim3 nthrds(1024, 1, 1);
714 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
715 const int nb = ((*n) + 1024 - 1)/ 1024;
716
718
719 if ( *n > 0) {
720 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
721 ((real *) a, (real *) b, (real *) c, (real *) bufred_d, *n);
723 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
725 }
726 else {
727 cuda_rzero(bufred_d, &red_s, stream);
728 }
730
731 return bufred[0];
732 }
733
738 void cuda_glsc3_many(real *h, void * w, void *v,void *mult, int *j, int *n,
739 cudaStream_t stream){
740 int pow2 = 1;
741 while(pow2 < (*j)){
742 pow2 = 2*pow2;
743 }
744 const int nt = 1024/pow2;
745 const dim3 nthrds(pow2, nt, 1);
746 const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
747 const int nb = ((*n) + nt - 1)/nt;
748
750
751 if ( *n > 0) {
752 glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
753 ((const real *) w, (const real **) v,
754 (const real *)mult, (real *)bufred_d, *j, *n);
757 <<<(*j), 1024, 0, stream>>>((real *) bufred_d, nb, *j);
759 }
760 else {
761 cuda_rzero(bufred_d, &red_s, stream);
762 }
763 cuda_global_reduce_add(h, bufred_d, (*j), stream);
764 }
765
770 real cuda_glsc2(void *a, void *b, int *n, cudaStream_t stream) {
771
772 const dim3 nthrds(1024, 1, 1);
773 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
774 const int nb = ((*n) + 1024 - 1)/ 1024;
775
777
778 if ( *n > 0) {
780 <<<nblcks, nthrds, 0, stream>>>((real *) a,
781 (real *) b,
782 (real *) bufred_d, *n);
784 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
786 }
787 else {
788 cuda_rzero(bufred_d, &red_s, stream);
789 }
791
792 return bufred[0];
793 }
794
799 real cuda_glsubnorm2(void *a, void *b, int *n, cudaStream_t stream) {
800
801 const dim3 nthrds(1024, 1, 1);
802 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
803 const int nb = ((*n) + 1024 - 1)/ 1024;
804
806
807 if ( *n > 0) {
809 <<<nblcks, nthrds, 0, stream>>>((real *) a,
810 (real *) b,
811 (real *) bufred_d, *n);
813 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
815 }
816 else {
817 cuda_rzero(bufred_d, &red_s, stream);
818 }
820
821 return bufred[0];
822 }
823
828 real cuda_glsum(void *a, int *n, cudaStream_t stream) {
829 const dim3 nthrds(1024, 1, 1);
830 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
831 const int nb = ((*n) + 1024 - 1)/ 1024;
832
834 if ( *n > 0) {
836 <<<nblcks, nthrds, 0, stream>>>((real *) a,
837 (real *) bufred_d, *n);
839 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
841 }
842 else {
843 cuda_rzero(bufred_d, &red_s, stream);
844 }
845
847
848 return bufred[0];
849 }
850
855 void cuda_absval(void *a, int *n, cudaStream_t stream) {
856
857 const dim3 nthrds(1024, 1, 1);
858 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
859
861 <<<nblcks, nthrds,0, stream>>>((real *) a, * n);
863
864 }
865
866 // ======================================================================== //
867 // Point-wise operations.
868
873 void cuda_pwmax_vec2(void *a, void *b, int *n, cudaStream_t stream) {
874
875 const dim3 nthrds(1024, 1, 1);
876 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
877
878 pwmax_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>
879 ((real *)a, (real *)b, *n);
881 }
882
887 void cuda_pwmax_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream) {
888
889 const dim3 nthrds(1024, 1, 1);
890 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
891
892 pwmax_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>
893 ((real *)a, (real *)b, (real *)c, *n);
895 }
896
901 void cuda_pwmax_sca2(void *a, real *c, int *n, cudaStream_t stream) {
902
903 const dim3 nthrds(1024, 1, 1);
904 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
905
906 pwmax_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>
907 ((real *)a, *c, *n);
909 }
910
915 void cuda_pwmax_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream) {
916
917 const dim3 nthrds(1024, 1, 1);
918 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
919
920 pwmax_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>
921 ((real *)a, (real *)b, *c, *n);
923 }
924
929 void cuda_pwmin_vec2(void *a, void *b, int *n, cudaStream_t stream) {
930
931 const dim3 nthrds(1024, 1, 1);
932 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
933
934 pwmin_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>
935 ((real *)a, (real *)b, *n);
937 }
938
943 void cuda_pwmin_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream) {
944
945 const dim3 nthrds(1024, 1, 1);
946 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
947
948 pwmin_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>
949 ((real *)a, (real *)b, (real *)c, *n);
951 }
952
957 void cuda_pwmin_sca2(void *a, real *c, int *n, cudaStream_t stream) {
958
959 const dim3 nthrds(1024, 1, 1);
960 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
961
962 pwmin_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *)a, *c, *n);
964 }
965
970 void cuda_pwmin_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream) {
971
972 const dim3 nthrds(1024, 1, 1);
973 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
974
975 pwmin_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>
976 ((real *)a, (real *)b, *c, *n);
978 }
979
980 // ======================================================================== //
981
985 void cuda_iadd(void *a, int *c, int *n, cudaStream_t stream) {
986
987 const dim3 nthrds(1024, 1, 1);
988 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
989
990 cadd_kernel<int><<<nblcks, nthrds, 0, stream>>>
991 ((int *) a, *c, *n);
993
994 }
995
996} /* extern "C" */
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
#define CUDA_CHECK(err)
Definition check.h:6
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
#define DEVICE_MPI_SUM
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
#define DEVICE_NCCL_SUM
void device_nccl_allreduce(void *sbuf_d, void *rbuf_d, int count, int nbytes, int op, void *stream)
void cuda_pwmax_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
Definition math.cu:915
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
Definition math.cu:547
void cuda_add2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:321
real cuda_glsubnorm2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:799
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cudaStream_t strm)
Definition math.cu:577
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cudaStream_t strm)
Definition math.cu:371
void cuda_global_reduce_add(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
Definition math.cu:643
real cuda_glsc2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:770
void cuda_addcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:533
void cuda_pwmin_vec2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:929
void cuda_pwmax_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:887
void cuda_cmult2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:175
void cuda_add2s1(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:305
void cuda_col3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:476
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n, cudaStream_t strm)
Definition math.cu:338
void * bufred_d
Definition math.cu:615
void cuda_sub3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:519
void cuda_copy(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:59
void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size, cudaStream_t strm)
Definition math.cu:141
void cuda_invcol2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:435
void cuda_pwmax_sca2(void *a, real *c, int *n, cudaStream_t stream)
Definition math.cu:901
void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:67
void cuda_col2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:463
void cuda_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:126
void cuda_add2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:260
void cuda_redbuf_check_alloc(int nb)
Definition math.cu:620
void cuda_cmult(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:162
void cuda_add4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
Definition math.cu:289
void cuda_sub2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:505
real * bufred
Definition math.cu:614
void cuda_invcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:449
void cuda_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cudaStream_t strm)
Definition math.cu:594
void cuda_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:82
real cuda_glsum(void *a, int *n, cudaStream_t stream)
Definition math.cu:828
real cuda_glsc3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:711
real cuda_vlsc3(void *u, void *v, void *w, int *n, cudaStream_t stream)
Definition math.cu:683
void cuda_pwmin_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
Definition math.cu:970
void cuda_cadd2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:230
int red_s
Definition math.cu:613
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:355
void cuda_add3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:275
void cuda_addcol3s2(void *a, void *b, void *c, real *s, int *n, cudaStream_t strm)
Definition math.cu:562
void cuda_pwmin_sca2(void *a, real *c, int *n, cudaStream_t stream)
Definition math.cu:957
void cuda_cfill(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:244
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cudaStream_t stream)
Definition math.cu:738
void cuda_absval(void *a, int *n, cudaStream_t stream)
Definition math.cu:855
void cuda_rzero(void *a, int *n, cudaStream_t strm)
Definition math.cu:155
void cuda_cdiv2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:202
void cuda_cdiv(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:189
void cuda_iadd(void *a, int *c, int *n, cudaStream_t stream)
Definition math.cu:985
void cuda_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:97
void cuda_pwmax_vec2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:873
void cuda_masked_atomic_reduction(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:112
void cuda_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, cudaStream_t strm)
Definition math.cu:405
void cuda_invcol1(void *a, int *n, cudaStream_t strm)
Definition math.cu:422
void cuda_radd(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:216
void cuda_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, cudaStream_t strm)
Definition math.cu:388
void cuda_subcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:490
void cuda_pwmin_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:943
Object for handling masks in Neko.
Definition mask.f90:34