Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.cu
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#include "math_kernel.h"
37#include <device/cuda/check.h>
38#include <stdio.h>
39#include <stdlib.h>
40
41#ifdef HAVE_NVSHMEM
42#include <nvshmem.h>
43#include <nvshmemx.h>
44#endif
45
46extern "C" {
47
50
51#ifdef HAVE_NCCL
54#endif
55
/** Device-to-device copy of n reals from b to a on stream strm.
 *  NOTE(review): trailing copy-kind/stream arguments restored from the
 *  cudaMemcpyAsync pattern — confirm against upstream. */
void cuda_copy(void *a, void *b, int *n, cudaStream_t strm) {
  CUDA_CHECK(cudaMemcpyAsync(a, b, (*n) * sizeof(real),
                             cudaMemcpyDeviceToDevice, strm));
}
63
/** Copy masked entries of b into a; mask holds m indices into arrays of
 *  length n. One thread per mask entry, 1024-thread blocks.
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_masked_copy(void *a, void *b, void *mask,
                      int *n, int *m, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

  masked_copy_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (int *) mask, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}
78
/** Gather: a(i) = b(mask(i)) for the m masked indices.
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_masked_gather_copy(void *a, void *b, void *mask,
                             int *n, int *m, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

  masked_gather_copy_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (int *) mask, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}
93
/** Gather copy variant assuming aligned/contiguous masked data.
 *  NOTE(review): kernel identifier lost in extraction; restored by analogy
 *  with the non-aligned variant — confirm against upstream math_kernel.h. */
void cuda_masked_gather_copy_aligned(void *a, void *b, void *mask,
                                     int *n, int *m, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

  masked_gather_copy_aligned_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (int *) mask, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}
108
/** Atomically accumulate masked entries of b into a (scatter-add through
 *  mask). NOTE(review): kernel name restored from naming pattern. */
void cuda_masked_atomic_reduction(void *a, void *b, void *mask,
                                  int *n, int *m, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

  masked_atomic_reduction_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (int *) mask, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}
/** Scatter: a(mask(i)) = b(i) for the m masked indices.
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_masked_scatter_copy(void *a, void *b, void *mask,
                              int *n, int *m, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

  masked_scatter_copy_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (int *) mask, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}
136
137
/** Fill a(mask(i)) = *c for the mask_size masked indices of an array of
 *  length size. Signature restored from the file's reference index. */
void cuda_cfill_mask(void* a, real* c, int* size, int* mask, int* mask_size,
                     cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);

  cfill_mask_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real*)a, *c, *size, mask, *mask_size);
  CUDA_CHECK(cudaGetLastError());
}
151
/** Zero n reals of device memory a on stream strm.
 *  Byte-wise memset is correct here only because the fill value is 0. */
void cuda_rzero(void *a, int *n, cudaStream_t strm) {
  CUDA_CHECK(cudaMemsetAsync(a, 0, (*n) * sizeof(real), strm));
}
158
/** Multiply vector by constant: a = c * a (n entries). */
void cuda_cmult(void *a, real *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  cmult_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
171
/** Multiply with constant: a = c * b (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_cmult2(void *a, void *b, real *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  cmult2_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
185
/** Divide constant by vector elementwise: a = c / a (n entries). */
void cuda_cdiv(void *a, real *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  cdiv_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
198
/** Elementwise a = c / b (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_cdiv2(void *a, void *b, real *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  cdiv2_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
212
/** Add constant to vector: a = a + c (n entries); reuses cadd_kernel. */
void cuda_radd(void *a, real *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  cadd_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
225
/** Add constant: a = b + c (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_cadd2(void *a, void *b, real *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  cadd2_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
240
/** Fill all n entries of a with *c; no-op when n <= 0 (a zero-size grid
 *  launch would be an error, hence the guard). */
void cuda_cfill(void *a, real *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    cfill_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }
}
255
/** Vector addition: a = a + b (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_add2(void *a, void *b, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add2_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, *n);
  CUDA_CHECK(cudaGetLastError());
}
270
/** Vector addition: a = b + c (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_add3(void *a, void *b, void *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add3_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (real *) c, *n);
  CUDA_CHECK(cudaGetLastError());
}
284
/** Vector addition: a = b + c + d (n entries). Signature completed from
 *  the file's reference index. */
void cuda_add4(void *a, void *b, void *c, void *d, int *n,
               cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add4_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (real *) c, (real *) d, *n);
  CUDA_CHECK(cudaGetLastError());
}
/** Scaled addition: a = c1 * a + b (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_add2s1(void *a, void *b, real *c1, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add2s1_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, *c1, *n);
  CUDA_CHECK(cudaGetLastError());
}
315
/** AXPY-style update: a = a + c1 * b (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_add2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add2s2_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, *c1, *n);
  CUDA_CHECK(cudaGetLastError());
}
331
/** Batched update: x = x + sum_k alpha(k) * p(k) over j vectors of length
 *  n. p is a device array of device pointers. Signature completed from the
 *  file's reference index. */
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n,
                      cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add2s2_many_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) x, (const real **) p, (real *) alpha, *j, *n);
  CUDA_CHECK(cudaGetLastError());
}
349
/** Squared update: a = a + c1 * (b * b) (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  addsqr2s2_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, *c1, *n);
  CUDA_CHECK(cudaGetLastError());
}
365
/** Scaled sum: a = c1 * b + c2 * c (n entries). Signature completed from
 *  the file's reference index. */
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n,
                 cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add3s2_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (real *) c, *c1, *c2, *n);
  CUDA_CHECK(cudaGetLastError());
}
382
/** Scaled sum: a = c1 * b + c2 * c + c3 * d (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2,
                 real *c3, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add4s3_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (real *) c, (real *) d, *c1, *c2, *c3, *n);
  CUDA_CHECK(cudaGetLastError());
}
399
/** Scaled sum: a = c1*b + c2*c + c3*d + c4*e (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1,
                 real *c2, real *c3, real *c4, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add5s4_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (real *) c, (real *) d, (real *) e,
     *c1, *c2, *c3, *c4, *n);
  CUDA_CHECK(cudaGetLastError());
}
417
/** Elementwise reciprocal in place: a = 1 / a (n entries). */
void cuda_invcol1(void *a, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  invcol1_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *n);
  CUDA_CHECK(cudaGetLastError());
}
430
/** Elementwise division: a = a / b (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_invcol2(void *a, void *b, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  invcol2_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, *n);
  CUDA_CHECK(cudaGetLastError());
}
444
/** Elementwise division: a = b / c (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_invcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  invcol3_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (real *) c, *n);
  CUDA_CHECK(cudaGetLastError());
}
458
/** Elementwise multiplication: a = a * b (n entries). */
void cuda_col2(void *a, void *b, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  col2_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, (real *) b, *n);
  CUDA_CHECK(cudaGetLastError());
}
471
/** Elementwise multiplication: a = b * c (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_col3(void *a, void *b, void *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  col3_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (real *) c, *n);
  CUDA_CHECK(cudaGetLastError());
}
485
/** Subtract product: a = a - b * c (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_subcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  subcol3_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (real *) c, *n);
  CUDA_CHECK(cudaGetLastError());
}
499
500
/** Vector subtraction: a = a - b (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_sub2(void *a, void *b, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  sub2_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, *n);
  CUDA_CHECK(cudaGetLastError());
}
514
/** Vector subtraction: a = b - c (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_sub3(void *a, void *b, void *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  sub3_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (real *) c, *n);
  CUDA_CHECK(cudaGetLastError());
}
528
/** Accumulate product: a = a + b * c (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_addcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  addcol3_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (real *) c, *n);
  CUDA_CHECK(cudaGetLastError());
}
542
/** Accumulate triple product: a = a + b * c * d (n entries). Signature
 *  completed from the file's reference index. */
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n,
                  cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  addcol4_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (real *) c, (real *) d, *n);
  CUDA_CHECK(cudaGetLastError());
}
557
/** Scaled product accumulation: a = a + s * b * c (n entries). Signature
 *  completed from the file's reference index. */
void cuda_addcol3s2(void *a, void *b, void *c, real *s, int *n,
                    cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  addcol3s2_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) a, (real *) b, (real *) c, *s, *n);
  CUDA_CHECK(cudaGetLastError());
}
572
/** Pointwise 3-component dot product:
 *  dot(i) = u1(i)*v1(i) + u2(i)*v2(i) + u3(i)*v3(i).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3,
                void *v1, void *v2, void *v3, int *n,
                cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  vdot3_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) dot, (real *) u1, (real *) u2, (real *) u3,
     (real *) v1, (real *) v2, (real *) v3, *n);
  CUDA_CHECK(cudaGetLastError());
}
589
/** Pointwise cross product u = v x w, component arrays (u1,u2,u3) etc.
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_vcross(void *u1, void *u2, void *u3,
                 void *v1, void *v2, void *v3,
                 void *w1, void *w2, void *w3,
                 int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  vcross_kernel<real><<<nblcks, nthrds, 0, strm>>>
    ((real *) u1, (real *) u2, (real *) u3,
     (real *) v1, (real *) v2, (real *) v3,
     (real *) w1, (real *) w2, (real *) w3, *n);
  CUDA_CHECK(cudaGetLastError());
}
608
609
/*
 * Reduction buffer: pinned host mirror (bufred) and device buffer
 * (bufred_d), grown on demand and reused across all reductions.
 */
int red_s = 0;
real * bufred = NULL;
void * bufred_d = NULL;

/** Ensure the reduction buffers hold at least nb+1 entries (grow-only;
 *  never shrinks). NOTE(review): the alloc/free calls below were lost in
 *  extraction and are reconstructed from the visible nvshmem_malloc line
 *  and the NVSHMEM/CUDA pairing — confirm against upstream. */
void cuda_redbuf_check_alloc(int nb) {
  if ( nb >= red_s) {
    red_s = nb + 1;
    if (bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(bufred));
#ifdef HAVE_NVSHMEM
      nvshmem_free(bufred_d);
#else
      CUDA_CHECK(cudaFree(bufred_d));
#endif
    }
    CUDA_CHECK(cudaMallocHost(&bufred, red_s * sizeof(real)));
#ifdef HAVE_NVSHMEM
    bufred_d = (real *) nvshmem_malloc(red_s*sizeof(real));
#else
    CUDA_CHECK(cudaMalloc(&bufred_d, red_s * sizeof(real)));
#endif
  }
}
639
/** Global (inter-rank) sum of n device partials in bufred_d; result lands
 *  in host buffer bufred. Backend chosen at compile time: NCCL, NVSHMEM,
 *  device-aware MPI, or single-rank copy-back. Blocks on the stream so the
 *  caller may read bufred immediately.
 *  NOTE(review): hyperlinked call lines were lost in extraction; the
 *  allreduce/memcpy calls are reconstructed from the visible argument
 *  tails and the reference index — confirm against upstream. */
void cuda_global_reduce_add(real *bufred, void *bufred_d,
                            int n, const cudaStream_t stream) {

#ifdef HAVE_NCCL
  device_nccl_allreduce(bufred_d, bufred_d, n, sizeof(real),
                        DEVICE_NCCL_SUM, stream);
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, n * sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#elif HAVE_NVSHMEM
  if (sizeof(real) == sizeof(float)) {
    nvshmemx_float_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                        (float *) bufred_d,
                                        (float *) bufred_d, n, stream);
  }
  else if (sizeof(real) == sizeof(double)) {
    nvshmemx_double_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                         (double *) bufred_d,
                                         (double *) bufred_d, n, stream);
  }
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d,
                             sizeof(real)*n, cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#elif HAVE_DEVICE_MPI
  cudaStreamSynchronize(stream);
  device_mpi_allreduce(bufred_d, bufred, n, sizeof(real), DEVICE_MPI_SUM);
#else
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, n * sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#endif
}
676
/** Global (inter-rank) max of n device partials in bufred_d; result in
 *  host buffer bufred. Same backend selection as cuda_global_reduce_add.
 *  NOTE(review): reconstructed from extraction residue — confirm. */
void cuda_global_reduce_max(real *bufred, void *bufred_d,
                            int n, const cudaStream_t stream) {

#ifdef HAVE_NCCL
  device_nccl_allreduce(bufred_d, bufred_d, n, sizeof(real),
                        DEVICE_NCCL_MAX, stream);
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, n * sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#elif HAVE_NVSHMEM
  if (sizeof(real) == sizeof(float)) {
    nvshmemx_float_max_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                        (float *) bufred_d,
                                        (float *) bufred_d, n, stream);
  }
  else if (sizeof(real) == sizeof(double)) {
    nvshmemx_double_max_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                         (double *) bufred_d,
                                         (double *) bufred_d, n, stream);
  }
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d,
                             sizeof(real)*n, cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#elif HAVE_DEVICE_MPI
  cudaStreamSynchronize(stream);
  device_mpi_allreduce(bufred_d, bufred, n, sizeof(real), DEVICE_MPI_MAX);
#else
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, n * sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#endif
}
713
/** Global (inter-rank) min of n device partials in bufred_d; result in
 *  host buffer bufred. Same backend selection as cuda_global_reduce_add.
 *  NOTE(review): reconstructed from extraction residue — confirm. */
void cuda_global_reduce_min(real *bufred, void *bufred_d,
                            int n, const cudaStream_t stream) {

#ifdef HAVE_NCCL
  device_nccl_allreduce(bufred_d, bufred_d, n, sizeof(real),
                        DEVICE_NCCL_MIN, stream);
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, n * sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#elif HAVE_NVSHMEM
  if (sizeof(real) == sizeof(float)) {
    nvshmemx_float_min_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                        (float *) bufred_d,
                                        (float *) bufred_d, n, stream);
  }
  else if (sizeof(real) == sizeof(double)) {
    nvshmemx_double_min_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                         (double *) bufred_d,
                                         (double *) bufred_d, n, stream);
  }
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d,
                             sizeof(real)*n, cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#elif HAVE_DEVICE_MPI
  cudaStreamSynchronize(stream);
  device_mpi_allreduce(bufred_d, bufred, n, sizeof(real), DEVICE_MPI_MIN);
#else
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, n * sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#endif
}
750
751
/** Local (single-rank) weighted inner product sum(u * v * w).
 *  Two-stage reduction into bufred_d, then a blocking copy of the single
 *  result to the pinned host buffer.
 *  NOTE(review): buffer-check and error-check lines restored from the
 *  pattern used by the sibling reductions — confirm against upstream. */
real cuda_vlsc3(void *u, void *v, void *w, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;

  cuda_redbuf_check_alloc(nb);

  glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *) u, (real *) v, (real *) w, (real *) bufred_d, *n);
  CUDA_CHECK(cudaGetLastError());
  reduce_kernel<real><<<1, 1024, 0, stream>>>((real *) bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());

  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);

  return bufred[0];
}
776
777
778
779
/** Global weighted inner product sum(a * b * c) across ranks.
 *  n == 0 zeroes the device buffer so the global reduce still contributes
 *  a valid 0 from this rank.
 *  NOTE(review): the stripped final line is reconstructed as the
 *  cuda_global_reduce_add call (mirrors glsc2/glsum) — confirm. */
real cuda_glsc3(void *a, void *b, void *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;

  cuda_redbuf_check_alloc(nb);

  if ( *n > 0) {
    glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *) a, (real *) b, (real *) c, (real *) bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>((real *) bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
  }
  else {
    cuda_rzero(bufred_d, &red_s, stream);
  }
  cuda_global_reduce_add(bufred, bufred_d, 1, stream);

  return bufred[0];
}
806
/** j simultaneous weighted inner products h(k) = sum(w * v(k) * mult),
 *  k = 1..j. Threads are laid out (pow2, 1024/pow2) with pow2 the next
 *  power of two >= j; per-vector partials land in bufred_d and are
 *  globally reduced into h.
 *  NOTE(review): the buffer-size argument and the per-vector reduce kernel
 *  name were lost in extraction and are reconstructed — confirm against
 *  upstream (buffer must hold j*nb partials). */
void cuda_glsc3_many(real *h, void * w, void *v, void *mult, int *j, int *n,
                     cudaStream_t stream) {
  int pow2 = 1;
  while (pow2 < (*j)) {
    pow2 = 2 * pow2;
  }
  const int nt = 1024 / pow2;
  const dim3 nthrds(pow2, nt, 1);
  const dim3 nblcks(((*n) + nt - 1) / nt, 1, 1);
  const int nb = ((*n) + nt - 1) / nt;

  cuda_redbuf_check_alloc((*j) * nb);

  if ( *n > 0) {
    glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((const real *) w, (const real **) v,
       (const real *) mult, (real *) bufred_d, *j, *n);
    CUDA_CHECK(cudaGetLastError());
    glsc3_reduce_kernel<real>
      <<<(*j), 1024, 0, stream>>>((real *) bufred_d, nb, *j);
    CUDA_CHECK(cudaGetLastError());
  }
  else {
    cuda_rzero(bufred_d, &red_s, stream);
  }
  cuda_global_reduce_add(h, bufred_d, (*j), stream);
}
838
/** Global inner product sum(a * b) across ranks.
 *  NOTE(review): kernel name and final global-reduce call restored from
 *  this file's pattern — confirm against upstream. */
real cuda_glsc2(void *a, void *b, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;

  cuda_redbuf_check_alloc(nb);

  if ( *n > 0) {
    glsc2_kernel<real>
      <<<nblcks, nthrds, 0, stream>>>((real *) a,
                                      (real *) b,
                                      (real *) bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>((real *) bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
  }
  else {
    cuda_rzero(bufred_d, &red_s, stream);
  }
  cuda_global_reduce_add(bufred, bufred_d, 1, stream);

  return bufred[0];
}
867
/** Global squared norm of the difference: sum((a - b)^2) across ranks.
 *  NOTE(review): kernel name and final global-reduce call restored from
 *  this file's pattern — confirm against upstream. */
real cuda_glsubnorm2(void *a, void *b, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;

  cuda_redbuf_check_alloc(nb);

  if ( *n > 0) {
    glsubnorm2_kernel<real>
      <<<nblcks, nthrds, 0, stream>>>((real *) a,
                                      (real *) b,
                                      (real *) bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>((real *) bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
  }
  else {
    cuda_rzero(bufred_d, &red_s, stream);
  }
  cuda_global_reduce_add(bufred, bufred_d, 1, stream);

  return bufred[0];
}
896
/** Global sum of a's n entries across ranks.
 *  NOTE(review): kernel name and final global-reduce call restored from
 *  this file's pattern — confirm against upstream. */
real cuda_glsum(void *a, int *n, cudaStream_t stream) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;

  cuda_redbuf_check_alloc(nb);
  if ( *n > 0) {
    glsum_kernel<real>
      <<<nblcks, nthrds, 0, stream>>>((real *) a,
                                      (real *) bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>((real *) bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
  }
  else {
    cuda_rzero(bufred_d, &red_s, stream);
  }

  cuda_global_reduce_add(bufred, bufred_d, 1, stream);

  return bufred[0];
}
923
/** Global max of a's n entries across ranks; *ninf is the identity
 *  (negative infinity) used to seed/pad the reduction.
 *  NOTE(review): kernel name and final global-reduce call restored from
 *  this file's pattern — confirm against upstream. */
real cuda_glmax(void *a, real *ninf, int *n, cudaStream_t stream) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;

  cuda_redbuf_check_alloc(nb);
  if ( *n > 0) {
    glmax_kernel<real>
      <<<nblcks, nthrds, 0, stream>>>((real *) a, *ninf,
                                      (real *) bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_max_kernel<real><<<1, 1024, 0, stream>>>((real *) bufred_d,
                                                    *ninf, nb);
    CUDA_CHECK(cudaGetLastError());
  }
  else {
    cuda_rzero(bufred_d, &red_s, stream);
  }

  cuda_global_reduce_max(bufred, bufred_d, 1, stream);

  return bufred[0];
}
951
/** Global min of a's n entries across ranks; *pinf is the identity
 *  (positive infinity) used to seed/pad the reduction.
 *  NOTE(review): kernel name and final global-reduce call restored from
 *  this file's pattern — confirm against upstream. */
real cuda_glmin(void *a, real *pinf, int *n, cudaStream_t stream) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;

  cuda_redbuf_check_alloc(nb);
  if ( *n > 0) {
    glmin_kernel<real>
      <<<nblcks, nthrds, 0, stream>>>((real *) a, *pinf,
                                      (real *) bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_min_kernel<real><<<1, 1024, 0, stream>>>((real *) bufred_d,
                                                    *pinf, nb);
    CUDA_CHECK(cudaGetLastError());
  }
  else {
    cuda_rzero(bufred_d, &red_s, stream);
  }

  cuda_global_reduce_min(bufred, bufred_d, 1, stream);

  return bufred[0];
}
979
/** Absolute value in place: a = |a| (n entries).
 *  NOTE(review): kernel name restored from this file's naming pattern. */
void cuda_absval(void *a, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  absval_kernel<real>
    <<<nblcks, nthrds, 0, stream>>>((real *) a, *n);
  CUDA_CHECK(cudaGetLastError());
}
994
995 // ======================================================================== //
996 // Point-wise operations.
997
/** Pointwise maximum: a = max(a, b) (n entries). */
void cuda_pwmax_vec2(void *a, void *b, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  pwmax_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *)a, (real *)b, *n);
  CUDA_CHECK(cudaGetLastError());
}
1011
/** Pointwise maximum: a = max(b, c) (n entries). */
void cuda_pwmax_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  pwmax_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *)a, (real *)b, (real *)c, *n);
  CUDA_CHECK(cudaGetLastError());
}
1025
/** Pointwise maximum against a scalar: a = max(a, *c) (n entries). */
void cuda_pwmax_sca2(void *a, real *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  pwmax_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *)a, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
1039
/** Pointwise maximum against a scalar: a = max(b, *c) (n entries). */
void cuda_pwmax_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  pwmax_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *)a, (real *)b, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
1053
/** Pointwise minimum: a = min(a, b) (n entries). */
void cuda_pwmin_vec2(void *a, void *b, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  pwmin_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *)a, (real *)b, *n);
  CUDA_CHECK(cudaGetLastError());
}
1067
/** Pointwise minimum: a = min(b, c) (n entries). */
void cuda_pwmin_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  pwmin_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *)a, (real *)b, (real *)c, *n);
  CUDA_CHECK(cudaGetLastError());
}
1081
/** Pointwise minimum against a scalar: a = min(a, *c) (n entries). */
void cuda_pwmin_sca2(void *a, real *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  pwmin_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *)a, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
1094
/** Pointwise minimum against a scalar: a = min(b, *c) (n entries). */
void cuda_pwmin_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  pwmin_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *)a, (real *)b, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
1108
1109 // ======================================================================== //
1110
/** Integer variant of cadd: a = a + *c for n int entries (reuses the
 *  templated cadd_kernel instantiated for int). */
void cuda_iadd(void *a, int *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  cadd_kernel<int><<<nblcks, nthrds, 0, stream>>>
    ((int *) a, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
1124
1125} /* extern "C" */
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
#define CUDA_CHECK(err)
Definition check.h:6
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
#define DEVICE_MPI_MAX
#define DEVICE_MPI_MIN
#define DEVICE_MPI_SUM
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
#define DEVICE_NCCL_MAX
#define DEVICE_NCCL_MIN
#define DEVICE_NCCL_SUM
void device_nccl_allreduce(void *sbuf_d, void *rbuf_d, int count, int nbytes, int op, void *stream)
void cuda_global_reduce_min(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
Definition math.cu:717
void cuda_pwmax_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
Definition math.cu:1044
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
Definition math.cu:547
void cuda_add2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:321
real cuda_glsubnorm2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:872
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cudaStream_t strm)
Definition math.cu:577
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cudaStream_t strm)
Definition math.cu:371
void cuda_global_reduce_add(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
Definition math.cu:643
real cuda_glsc2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:843
void cuda_addcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:533
void cuda_pwmin_vec2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:1058
void cuda_pwmax_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:1016
real cuda_glmin(void *a, real *pinf, int *n, cudaStream_t stream)
Definition math.cu:956
void cuda_cmult2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:175
void cuda_add2s1(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:305
void cuda_col3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:476
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n, cudaStream_t strm)
Definition math.cu:338
void * bufred_d
Definition math.cu:615
void cuda_sub3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:519
void cuda_copy(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:59
void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size, cudaStream_t strm)
Definition math.cu:141
void cuda_invcol2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:435
void cuda_pwmax_sca2(void *a, real *c, int *n, cudaStream_t stream)
Definition math.cu:1030
void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:67
void cuda_col2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:463
void cuda_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:126
void cuda_add2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:260
void cuda_redbuf_check_alloc(int nb)
Definition math.cu:620
void cuda_cmult(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:162
void cuda_add4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
Definition math.cu:289
void cuda_sub2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:505
real * bufred
Definition math.cu:614
void cuda_invcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:449
void cuda_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cudaStream_t strm)
Definition math.cu:594
void cuda_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:82
real cuda_glsum(void *a, int *n, cudaStream_t stream)
Definition math.cu:901
real cuda_glsc3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:784
real cuda_vlsc3(void *u, void *v, void *w, int *n, cudaStream_t stream)
Definition math.cu:756
void cuda_pwmin_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
Definition math.cu:1099
void cuda_cadd2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:230
int red_s
Definition math.cu:613
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:355
void cuda_add3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:275
void cuda_addcol3s2(void *a, void *b, void *c, real *s, int *n, cudaStream_t strm)
Definition math.cu:562
void cuda_pwmin_sca2(void *a, real *c, int *n, cudaStream_t stream)
Definition math.cu:1086
void cuda_cfill(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:244
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cudaStream_t stream)
Definition math.cu:811
void cuda_absval(void *a, int *n, cudaStream_t stream)
Definition math.cu:984
void cuda_rzero(void *a, int *n, cudaStream_t strm)
Definition math.cu:155
void cuda_cdiv2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:202
void cuda_cdiv(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:189
void cuda_global_reduce_max(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
Definition math.cu:680
void cuda_iadd(void *a, int *c, int *n, cudaStream_t stream)
Definition math.cu:1114
void cuda_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:97
void cuda_pwmax_vec2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:1002
void cuda_masked_atomic_reduction(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:112
void cuda_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, cudaStream_t strm)
Definition math.cu:405
void cuda_invcol1(void *a, int *n, cudaStream_t strm)
Definition math.cu:422
void cuda_radd(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:216
void cuda_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, cudaStream_t strm)
Definition math.cu:388
real cuda_glmax(void *a, real *ninf, int *n, cudaStream_t stream)
Definition math.cu:928
void cuda_subcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:490
void cuda_pwmin_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:1072
Object for handling masks in Neko.
Definition mask.f90:34