Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.cu
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#include "math_kernel.h"
37#include <device/cuda/check.h>
38#include <stdio.h>
39#include <stdlib.h>
40
41#ifdef HAVE_NVSHMEM
42#include <nvshmem.h>
43#include <nvshmemx.h>
44#endif
45
46extern "C" {
47
50
51#ifdef HAVE_NCCL
54#endif
55
59 void cuda_copy(void *a, void *b, int *n, cudaStream_t strm) {
60 CUDA_CHECK(cudaMemcpyAsync(a, b, (*n) * sizeof(real),
62 }
63
67 void cuda_masked_copy(void *a, void *b, void *mask,
68 int *n, int *m, cudaStream_t strm) {
69
70 const dim3 nthrds(1024, 1, 1);
71 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
72
74 ((real *) a, (real*) b,(int*) mask, *n, *m);
76
77 }
78
82 void cuda_masked_gather_copy(void *a, void *b, void *mask,
83 int *n, int *m, cudaStream_t strm) {
84
85 const dim3 nthrds(1024, 1, 1);
86 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
87
89 ((real *) a, (real*) b,(int*) mask, *n, *m);
91
92 }
93
97 void cuda_masked_atomic_reduction(void *a, void *b, void *mask,
98 int *n, int *m, cudaStream_t strm) {
99
100 const dim3 nthrds(1024, 1, 1);
101 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
102
104 ((real *) a, (real *) b, (int *) mask, *n, *m);
106
107 }
111 void cuda_masked_scatter_copy(void *a, void *b, void *mask,
112 int *n, int *m, cudaStream_t strm) {
113
114 const dim3 nthrds(1024, 1, 1);
115 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
116
118 ((real *) a, (real*) b,(int*) mask, *n, *m);
120 }
121
122
126 void cuda_cfill_mask(void* a, real* c, int* size, int* mask, int* mask_size,
128
129 const dim3 nthrds(1024, 1, 1);
130 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
131
133 ((real*)a, *c, *size, mask, *mask_size);
135 }
136
140 void cuda_rzero(void *a, int *n, cudaStream_t strm) {
141 CUDA_CHECK(cudaMemsetAsync(a, 0, (*n) * sizeof(real), strm));
142 }
143
147 void cuda_cmult(void *a, real *c, int *n, cudaStream_t strm) {
148
149 const dim3 nthrds(1024, 1, 1);
150 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
151
152 cmult_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
154
155 }
156
160 void cuda_cmult2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
161
162 const dim3 nthrds(1024, 1, 1);
163 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
164
166 ((real *) a, (real *) b, *c, *n);
168
169 }
170
174 void cuda_cdiv(void *a, real *c, int *n, cudaStream_t strm) {
175
176 const dim3 nthrds(1024, 1, 1);
177 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
178
179 cdiv_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
181
182 }
183
187 void cuda_cdiv2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
188
189 const dim3 nthrds(1024, 1, 1);
190 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
191
193 ((real *) a, (real *) b, *c, *n);
195
196 }
197
201 void cuda_radd(void *a, real *c, int *n, cudaStream_t strm) {
202
203 const dim3 nthrds(1024, 1, 1);
204 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
205
206 cadd_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
208
209 }
210
215 void cuda_cadd2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
216
217 const dim3 nthrds(1024, 1, 1);
218 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
219
221 ((real *) a, (real *) b, *c, *n);
223
224 }
225
229 void cuda_cfill(void *a, real *c, int *n, cudaStream_t strm) {
230
231 const dim3 nthrds(1024, 1, 1);
232 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
233
234 if (*n > 0){
235 cfill_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
237 }
238
239 }
240
245 void cuda_add2(void *a, void *b, int *n, cudaStream_t strm) {
246
247 const dim3 nthrds(1024, 1, 1);
248 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
249
251 ((real *) a, (real *) b, *n);
253
254 }
255
260 void cuda_add3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
261
262 const dim3 nthrds(1024, 1, 1);
263 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
264
266 ((real *) a, (real *) b, (real *) c, *n);
268 }
269
274 void cuda_add4(void *a, void *b, void *c, void *d, int *n,
276
277 const dim3 nthrds(1024, 1, 1);
278 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
279
281 ((real *) a, (real *) b, (real *) c, (real *) d, *n);
283
284 }
290 void cuda_add2s1(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
291
292 const dim3 nthrds(1024, 1, 1);
293 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
294
296 ((real *) a, (real *) b, *c1, *n);
298
299 }
300
306 void cuda_add2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
307
308 const dim3 nthrds(1024, 1, 1);
309 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
310
312 ((real *) a, (real *) b, *c1, *n);
314
315 }
316
323 void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n,
325
326 const dim3 nthrds(1024, 1, 1);
327 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
328
330 ((real *) x, (const real **) p, (real *) alpha, *j, *n);
332
333 }
334
340 void cuda_addsqr2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
341
342 const dim3 nthrds(1024, 1, 1);
343 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
344
346 ((real *) a, (real *) b, *c1, *n);
348
349 }
350
356 void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n,
358
359 const dim3 nthrds(1024, 1, 1);
360 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
361
363 ((real *) a, (real *) b, (real *) c, *c1, *c2, *n);
365
366 }
367
372 void cuda_invcol1(void *a, int *n, cudaStream_t strm) {
373
374 const dim3 nthrds(1024, 1, 1);
375 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
376
377 invcol1_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *n);
379 }
380
385 void cuda_invcol2(void *a, void *b, int *n, cudaStream_t strm) {
386
387 const dim3 nthrds(1024, 1, 1);
388 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
389
391 ((real *) a, (real *) b, *n);
393 }
394
399 void cuda_invcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
400
401 const dim3 nthrds(1024, 1, 1);
402 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
403
405 ((real *) a, (real *) b, (real *) c, *n);
407 }
408
413 void cuda_col2(void *a, void *b, int *n, cudaStream_t strm) {
414
415 const dim3 nthrds(1024, 1, 1);
416 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
417
418 col2_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, (real *) b, *n);
420 }
421
426 void cuda_col3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
427
428 const dim3 nthrds(1024, 1, 1);
429 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
430
432 ((real *) a, (real *) b, (real *) c, *n);
434 }
435
440 void cuda_subcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
441
442 const dim3 nthrds(1024, 1, 1);
443 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
444
446 ((real *) a, (real *) b, (real *) c, *n);
448 }
449
450
455 void cuda_sub2(void *a, void *b, int *n, cudaStream_t strm) {
456
457 const dim3 nthrds(1024, 1, 1);
458 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
459
461 ((real *) a, (real *) b, *n);
463 }
464
469 void cuda_sub3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
470
471 const dim3 nthrds(1024, 1, 1);
472 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
473
475 ((real *) a, (real *) b, (real *) c, *n);
477 }
478
483 void cuda_addcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
484
485 const dim3 nthrds(1024, 1, 1);
486 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
487
489 ((real *) a, (real *) b, (real *) c, *n);
491 }
492
497 void cuda_addcol4(void *a, void *b, void *c, void *d, int *n,
499
500 const dim3 nthrds(1024, 1, 1);
501 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
502
504 ((real *) a, (real *) b, (real *) c, (real *) d, *n);
506 }
507
512 void cuda_vdot3(void *dot, void *u1, void *u2, void *u3,
513 void *v1, void *v2, void *v3, int *n,
515
516 const dim3 nthrds(1024, 1, 1);
517 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
518
520 ((real *) dot, (real *) u1, (real *) u2, (real *) u3,
521 (real *) v1, (real *) v2, (real *) v3, *n);
523 }
524
529 void cuda_vcross(void *u1, void *u2, void *u3,
530 void *v1, void *v2, void *v3,
531 void *w1, void *w2, void *w3,
532 int *n, cudaStream_t strm) {
533
534 const dim3 nthrds(1024, 1, 1);
535 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
536
538 ((real *) u1, (real *) u2, (real *) u3,
539 (real *) v1, (real *) v2, (real *) v3,
540 (real *) w1, (real *) w2, (real *) w3, *n);
542 }
543
544
545 /*
546 * Reduction buffer
547 */
548 int red_s = 0;
550 void * bufred_d = NULL;
551
556 if ( nb >= red_s) {
557 red_s = nb+1;
558 if (bufred != NULL) {
560#ifdef HAVE_NVSHMEM
562#else
564#endif
565 }
567#ifdef HAVE_NVSHMEM
568 bufred_d = (real *) nvshmem_malloc(red_s*sizeof(real));
569#else
571#endif
572 }
573 }
574
579 int n, const cudaStream_t stream) {
580
581#ifdef HAVE_NCCL
583 DEVICE_NCCL_SUM, stream);
585 cudaMemcpyDeviceToHost, stream));
586 cudaStreamSynchronize(stream);
587#elif HAVE_NVSHMEM
588 if (sizeof(real) == sizeof(float)) {
590 (float *) bufred_d,
591 (float *) bufred_d, n, stream);
592 }
593 else if (sizeof(real) == sizeof(double)) {
595 (double *) bufred_d,
596 (double *) bufred_d, n, stream);
597
598 }
600 sizeof(real)*n, cudaMemcpyDeviceToHost, stream));
601 cudaStreamSynchronize(stream);
602#elif HAVE_DEVICE_MPI
603 cudaStreamSynchronize(stream);
605#else
607 cudaMemcpyDeviceToHost, stream));
608 cudaStreamSynchronize(stream);
609#endif
610 }
611
612
613
618 real cuda_vlsc3(void *u, void *v, void *w, int *n, cudaStream_t stream) {
619
620 const dim3 nthrds(1024, 1, 1);
621 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
622 const int nb = ((*n) + 1024 - 1)/ 1024;
623
625
626 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
627 ((real *) u, (real *) v, (real *) w, (real *) bufred_d, *n);
629 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
631
633 cudaMemcpyDeviceToHost, stream));
634 cudaStreamSynchronize(stream);
635
636 return bufred[0];
637 }
638
639
640
641
646 real cuda_glsc3(void *a, void *b, void *c, int *n, cudaStream_t stream) {
647
648 const dim3 nthrds(1024, 1, 1);
649 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
650 const int nb = ((*n) + 1024 - 1)/ 1024;
651
653
654 if ( *n > 0) {
655 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
656 ((real *) a, (real *) b, (real *) c, (real *) bufred_d, *n);
658 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
660 }
661 else {
662 cuda_rzero(bufred_d, &red_s, stream);
663 }
665
666 return bufred[0];
667 }
668
673 void cuda_glsc3_many(real *h, void * w, void *v,void *mult, int *j, int *n,
674 cudaStream_t stream){
675 int pow2 = 1;
676 while(pow2 < (*j)){
677 pow2 = 2*pow2;
678 }
679 const int nt = 1024/pow2;
680 const dim3 nthrds(pow2, nt, 1);
681 const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
682 const int nb = ((*n) + nt - 1)/nt;
683
685
686 if ( *n > 0) {
687 glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
688 ((const real *) w, (const real **) v,
689 (const real *)mult, (real *)bufred_d, *j, *n);
692 <<<(*j), 1024, 0, stream>>>((real *) bufred_d, nb, *j);
694 }
695 else {
696 cuda_rzero(bufred_d, &red_s, stream);
697 }
698 cuda_global_reduce_add(h, bufred_d, (*j), stream);
699 }
700
705 real cuda_glsc2(void *a, void *b, int *n, cudaStream_t stream) {
706
707 const dim3 nthrds(1024, 1, 1);
708 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
709 const int nb = ((*n) + 1024 - 1)/ 1024;
710
712
713 if ( *n > 0) {
715 <<<nblcks, nthrds, 0, stream>>>((real *) a,
716 (real *) b,
717 (real *) bufred_d, *n);
719 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
721 }
722 else {
723 cuda_rzero(bufred_d, &red_s, stream);
724 }
726
727 return bufred[0];
728 }
729
734 real cuda_glsubnorm2(void *a, void *b, int *n, cudaStream_t stream) {
735
736 const dim3 nthrds(1024, 1, 1);
737 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
738 const int nb = ((*n) + 1024 - 1)/ 1024;
739
741
742 if ( *n > 0) {
744 <<<nblcks, nthrds, 0, stream>>>((real *) a,
745 (real *) b,
746 (real *) bufred_d, *n);
748 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
750 }
751 else {
752 cuda_rzero(bufred_d, &red_s, stream);
753 }
755
756 return bufred[0];
757 }
758
763 real cuda_glsum(void *a, int *n, cudaStream_t stream) {
764 const dim3 nthrds(1024, 1, 1);
765 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
766 const int nb = ((*n) + 1024 - 1)/ 1024;
767
769 if ( *n > 0) {
771 <<<nblcks, nthrds, 0, stream>>>((real *) a,
772 (real *) bufred_d, *n);
774 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
776 }
777 else {
778 cuda_rzero(bufred_d, &red_s, stream);
779 }
780
782
783 return bufred[0];
784 }
785
790 void cuda_absval(void *a, int *n, cudaStream_t stream) {
791
792 const dim3 nthrds(1024, 1, 1);
793 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
794
796 <<<nblcks, nthrds,0, stream>>>((real *) a, * n);
798
799 }
800
801 // ======================================================================== //
802 // Point-wise operations.
803
808 void cuda_pwmax_vec2(void *a, void *b, int *n, cudaStream_t stream) {
809
810 const dim3 nthrds(1024, 1, 1);
811 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
812
813 pwmax_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>
814 ((real *)a, (real *)b, *n);
816 }
817
822 void cuda_pwmax_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream) {
823
824 const dim3 nthrds(1024, 1, 1);
825 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
826
827 pwmax_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>
828 ((real *)a, (real *)b, (real *)c, *n);
830 }
831
836 void cuda_pwmax_sca2(void *a, real *c, int *n, cudaStream_t stream) {
837
838 const dim3 nthrds(1024, 1, 1);
839 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
840
841 pwmax_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>
842 ((real *)a, *c, *n);
844 }
845
850 void cuda_pwmax_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream) {
851
852 const dim3 nthrds(1024, 1, 1);
853 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
854
855 pwmax_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>
856 ((real *)a, (real *)b, *c, *n);
858 }
859
864 void cuda_pwmin_vec2(void *a, void *b, int *n, cudaStream_t stream) {
865
866 const dim3 nthrds(1024, 1, 1);
867 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
868
869 pwmin_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>
870 ((real *)a, (real *)b, *n);
872 }
873
878 void cuda_pwmin_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream) {
879
880 const dim3 nthrds(1024, 1, 1);
881 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
882
883 pwmin_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>
884 ((real *)a, (real *)b, (real *)c, *n);
886 }
887
892 void cuda_pwmin_sca2(void *a, real *c, int *n, cudaStream_t stream) {
893
894 const dim3 nthrds(1024, 1, 1);
895 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
896
897 pwmin_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *)a, *c, *n);
899 }
900
905 void cuda_pwmin_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream) {
906
907 const dim3 nthrds(1024, 1, 1);
908 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
909
910 pwmin_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>
911 ((real *)a, (real *)b, *c, *n);
913 }
914
915 // ======================================================================== //
916
920 void cuda_iadd(void *a, int *c, int *n, cudaStream_t stream) {
921
922 const dim3 nthrds(1024, 1, 1);
923 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
924
925 cadd_kernel<int><<<nblcks, nthrds, 0, stream>>>
926 ((int *) a, *c, *n);
928
929 }
930
931} /* extern "C" */
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
#define CUDA_CHECK(err)
Definition check.h:6
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
#define DEVICE_MPI_SUM
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
#define DEVICE_NCCL_SUM
void device_nccl_allreduce(void *sbuf_d, void *rbuf_d, int count, int nbytes, int op, void *stream)
void cuda_pwmax_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
Definition math.cu:850
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
Definition math.cu:497
void cuda_add2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:306
real cuda_glsubnorm2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:734
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cudaStream_t strm)
Definition math.cu:512
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cudaStream_t strm)
Definition math.cu:356
void cuda_global_reduce_add(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
Definition math.cu:578
real cuda_glsc2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:705
void cuda_addcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:483
void cuda_pwmin_vec2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:864
void cuda_pwmax_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:822
void cuda_cmult2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:160
void cuda_add2s1(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:290
void cuda_col3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:426
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n, cudaStream_t strm)
Definition math.cu:323
void * bufred_d
Definition math.cu:550
void cuda_sub3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:469
void cuda_copy(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:59
void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size, cudaStream_t strm)
Definition math.cu:126
void cuda_invcol2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:385
void cuda_pwmax_sca2(void *a, real *c, int *n, cudaStream_t stream)
Definition math.cu:836
void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:67
void cuda_col2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:413
void cuda_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:111
void cuda_add2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:245
void cuda_redbuf_check_alloc(int nb)
Definition math.cu:555
void cuda_cmult(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:147
void cuda_add4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
Definition math.cu:274
void cuda_sub2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:455
real * bufred
Definition math.cu:549
void cuda_invcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:399
void cuda_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cudaStream_t strm)
Definition math.cu:529
void cuda_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:82
real cuda_glsum(void *a, int *n, cudaStream_t stream)
Definition math.cu:763
real cuda_glsc3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:646
real cuda_vlsc3(void *u, void *v, void *w, int *n, cudaStream_t stream)
Definition math.cu:618
void cuda_pwmin_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
Definition math.cu:905
void cuda_cadd2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:215
int red_s
Definition math.cu:548
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:340
void cuda_add3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:260
void cuda_pwmin_sca2(void *a, real *c, int *n, cudaStream_t stream)
Definition math.cu:892
void cuda_cfill(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:229
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cudaStream_t stream)
Definition math.cu:673
void cuda_absval(void *a, int *n, cudaStream_t stream)
Definition math.cu:790
void cuda_rzero(void *a, int *n, cudaStream_t strm)
Definition math.cu:140
void cuda_cdiv2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:187
void cuda_cdiv(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:174
void cuda_iadd(void *a, int *c, int *n, cudaStream_t stream)
Definition math.cu:920
void cuda_pwmax_vec2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:808
void cuda_masked_atomic_reduction(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:97
void cuda_invcol1(void *a, int *n, cudaStream_t strm)
Definition math.cu:372
void cuda_radd(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:201
void cuda_subcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:440
void cuda_pwmin_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:878
Object for handling masks in Neko.
Definition mask.f90:34