Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.cu
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#include "math_kernel.h"
37#include <device/cuda/check.h>
38#include <stdio.h>
39#include <stdlib.h>
40
41#ifdef HAVE_NVSHMEM
42#include <nvshmem.h>
43#include <nvshmemx.h>
44#endif
45
46extern "C" {
47
50
51#ifdef HAVE_NCCL
54#endif
55
59 void cuda_copy(void *a, void *b, int *n, cudaStream_t strm) {
60 CUDA_CHECK(cudaMemcpyAsync(a, b, (*n) * sizeof(real),
62 }
63
67 void cuda_masked_copy(void *a, void *b, void *mask,
68 int *n, int *m, cudaStream_t strm) {
69
70 const dim3 nthrds(1024, 1, 1);
71 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
72
74 ((real *) a, (real*) b,(int*) mask, *n, *m);
76
77 }
78
82 void cuda_masked_gather_copy(void *a, void *b, void *mask,
83 int *n, int *m, cudaStream_t strm) {
84
85 const dim3 nthrds(1024, 1, 1);
86 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
87
89 ((real *) a, (real*) b,(int*) mask, *n, *m);
91
92 }
93
97 void cuda_masked_atomic_reduction(void *a, void *b, void *mask,
98 int *n, int *m, cudaStream_t strm) {
99
100 const dim3 nthrds(1024, 1, 1);
101 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
102
104 ((real *) a, (real *) b, (int *) mask, *n, *m);
106
107 }
111 void cuda_masked_scatter_copy(void *a, void *b, void *mask,
112 int *n, int *m, cudaStream_t strm) {
113
114 const dim3 nthrds(1024, 1, 1);
115 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
116
118 ((real *) a, (real*) b,(int*) mask, *n, *m);
120 }
121
122
126 void cuda_cfill_mask(void* a, real* c, int* size, int* mask, int* mask_size,
128
129 const dim3 nthrds(1024, 1, 1);
130 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
131
133 ((real*)a, *c, *size, mask, *mask_size);
135 }
136
140 void cuda_rzero(void *a, int *n, cudaStream_t strm) {
141 CUDA_CHECK(cudaMemsetAsync(a, 0, (*n) * sizeof(real), strm));
142 }
143
147 void cuda_cmult(void *a, real *c, int *n, cudaStream_t strm) {
148
149 const dim3 nthrds(1024, 1, 1);
150 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
151
152 cmult_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
154
155 }
156
160 void cuda_cmult2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
161
162 const dim3 nthrds(1024, 1, 1);
163 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
164
166 ((real *) a, (real *) b, *c, *n);
168
169 }
170
174 void cuda_cdiv(void *a, real *c, int *n, cudaStream_t strm) {
175
176 const dim3 nthrds(1024, 1, 1);
177 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
178
179 cdiv_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
181
182 }
183
187 void cuda_cdiv2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
188
189 const dim3 nthrds(1024, 1, 1);
190 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
191
193 ((real *) a, (real *) b, *c, *n);
195
196 }
197
201 void cuda_radd(void *a, real *c, int *n, cudaStream_t strm) {
202
203 const dim3 nthrds(1024, 1, 1);
204 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
205
206 cadd_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
208
209 }
210
215 void cuda_cadd2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
216
217 const dim3 nthrds(1024, 1, 1);
218 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
219
221 ((real *) a, (real *) b, *c, *n);
223
224 }
225
229 void cuda_cfill(void *a, real *c, int *n, cudaStream_t strm) {
230
231 const dim3 nthrds(1024, 1, 1);
232 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
233
234 if (*n > 0){
235 cfill_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
237 }
238
239 }
240
245 void cuda_add2(void *a, void *b, int *n, cudaStream_t strm) {
246
247 const dim3 nthrds(1024, 1, 1);
248 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
249
251 ((real *) a, (real *) b, *n);
253
254 }
255
260 void cuda_add3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
261
262 const dim3 nthrds(1024, 1, 1);
263 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
264
266 ((real *) a, (real *) b, (real *) c, *n);
268 }
269
274 void cuda_add4(void *a, void *b, void *c, void *d, int *n,
276
277 const dim3 nthrds(1024, 1, 1);
278 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
279
281 ((real *) a, (real *) b, (real *) c, (real *) d, *n);
283
284 }
290 void cuda_add2s1(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
291
292 const dim3 nthrds(1024, 1, 1);
293 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
294
296 ((real *) a, (real *) b, *c1, *n);
298
299 }
300
306 void cuda_add2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
307
308 const dim3 nthrds(1024, 1, 1);
309 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
310
312 ((real *) a, (real *) b, *c1, *n);
314
315 }
316
323 void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n,
325
326 const dim3 nthrds(1024, 1, 1);
327 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
328
330 ((real *) x, (const real **) p, (real *) alpha, *j, *n);
332
333 }
334
340 void cuda_addsqr2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
341
342 const dim3 nthrds(1024, 1, 1);
343 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
344
346 ((real *) a, (real *) b, *c1, *n);
348
349 }
350
356 void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n,
358
359 const dim3 nthrds(1024, 1, 1);
360 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
361
363 ((real *) a, (real *) b, (real *) c, *c1, *c2, *n);
365
366 }
367
373 void cuda_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2,
374 real *c3, int *n, cudaStream_t strm) {
375
376 const dim3 nthrds(1024, 1, 1);
377 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
378
380 ((real *) a, (real *) b, (real *) c, (real *) d, *c1, *c2, *c3, *n);
382
383 }
384
390 void cuda_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1,
391 real *c2, real *c3, real *c4, int *n, cudaStream_t strm) {
392
393 const dim3 nthrds(1024, 1, 1);
394 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
395
397 ((real *) a, (real *) b, (real *) c, (real *) d, (real *) e,
398 *c1, *c2, *c3, *c4, *n);
400
401 }
402
407 void cuda_invcol1(void *a, int *n, cudaStream_t strm) {
408
409 const dim3 nthrds(1024, 1, 1);
410 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
411
412 invcol1_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *n);
414 }
415
420 void cuda_invcol2(void *a, void *b, int *n, cudaStream_t strm) {
421
422 const dim3 nthrds(1024, 1, 1);
423 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
424
426 ((real *) a, (real *) b, *n);
428 }
429
434 void cuda_invcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
435
436 const dim3 nthrds(1024, 1, 1);
437 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
438
440 ((real *) a, (real *) b, (real *) c, *n);
442 }
443
448 void cuda_col2(void *a, void *b, int *n, cudaStream_t strm) {
449
450 const dim3 nthrds(1024, 1, 1);
451 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
452
453 col2_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, (real *) b, *n);
455 }
456
461 void cuda_col3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
462
463 const dim3 nthrds(1024, 1, 1);
464 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
465
467 ((real *) a, (real *) b, (real *) c, *n);
469 }
470
475 void cuda_subcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
476
477 const dim3 nthrds(1024, 1, 1);
478 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
479
481 ((real *) a, (real *) b, (real *) c, *n);
483 }
484
485
490 void cuda_sub2(void *a, void *b, int *n, cudaStream_t strm) {
491
492 const dim3 nthrds(1024, 1, 1);
493 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
494
496 ((real *) a, (real *) b, *n);
498 }
499
504 void cuda_sub3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
505
506 const dim3 nthrds(1024, 1, 1);
507 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
508
510 ((real *) a, (real *) b, (real *) c, *n);
512 }
513
518 void cuda_addcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
519
520 const dim3 nthrds(1024, 1, 1);
521 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
522
524 ((real *) a, (real *) b, (real *) c, *n);
526 }
527
532 void cuda_addcol4(void *a, void *b, void *c, void *d, int *n,
534
535 const dim3 nthrds(1024, 1, 1);
536 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
537
539 ((real *) a, (real *) b, (real *) c, (real *) d, *n);
541 }
542
547 void cuda_addcol3s2(void *a, void *b, void *c, real *s, int *n,
549
550 const dim3 nthrds(1024, 1, 1);
551 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
552
554 ((real *) a, (real *) b, (real *) c, *s, *n);
556 }
557
562 void cuda_vdot3(void *dot, void *u1, void *u2, void *u3,
563 void *v1, void *v2, void *v3, int *n,
565
566 const dim3 nthrds(1024, 1, 1);
567 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
568
570 ((real *) dot, (real *) u1, (real *) u2, (real *) u3,
571 (real *) v1, (real *) v2, (real *) v3, *n);
573 }
574
579 void cuda_vcross(void *u1, void *u2, void *u3,
580 void *v1, void *v2, void *v3,
581 void *w1, void *w2, void *w3,
582 int *n, cudaStream_t strm) {
583
584 const dim3 nthrds(1024, 1, 1);
585 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
586
588 ((real *) u1, (real *) u2, (real *) u3,
589 (real *) v1, (real *) v2, (real *) v3,
590 (real *) w1, (real *) w2, (real *) w3, *n);
592 }
593
594
595 /*
596 * Reduction buffer
597 */
598 int red_s = 0;
600 void * bufred_d = NULL;
601
606 if ( nb >= red_s) {
607 red_s = nb+1;
608 if (bufred != NULL) {
610#ifdef HAVE_NVSHMEM
612#else
614#endif
615 }
617#ifdef HAVE_NVSHMEM
618 bufred_d = (real *) nvshmem_malloc(red_s*sizeof(real));
619#else
621#endif
622 }
623 }
624
629 int n, const cudaStream_t stream) {
630
631#ifdef HAVE_NCCL
633 DEVICE_NCCL_SUM, stream);
635 cudaMemcpyDeviceToHost, stream));
636 cudaStreamSynchronize(stream);
637#elif HAVE_NVSHMEM
638 if (sizeof(real) == sizeof(float)) {
640 (float *) bufred_d,
641 (float *) bufred_d, n, stream);
642 }
643 else if (sizeof(real) == sizeof(double)) {
645 (double *) bufred_d,
646 (double *) bufred_d, n, stream);
647
648 }
650 sizeof(real)*n, cudaMemcpyDeviceToHost, stream));
651 cudaStreamSynchronize(stream);
652#elif HAVE_DEVICE_MPI
653 cudaStreamSynchronize(stream);
655#else
657 cudaMemcpyDeviceToHost, stream));
658 cudaStreamSynchronize(stream);
659#endif
660 }
661
662
663
668 real cuda_vlsc3(void *u, void *v, void *w, int *n, cudaStream_t stream) {
669
670 const dim3 nthrds(1024, 1, 1);
671 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
672 const int nb = ((*n) + 1024 - 1)/ 1024;
673
675
676 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
677 ((real *) u, (real *) v, (real *) w, (real *) bufred_d, *n);
679 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
681
683 cudaMemcpyDeviceToHost, stream));
684 cudaStreamSynchronize(stream);
685
686 return bufred[0];
687 }
688
689
690
691
696 real cuda_glsc3(void *a, void *b, void *c, int *n, cudaStream_t stream) {
697
698 const dim3 nthrds(1024, 1, 1);
699 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
700 const int nb = ((*n) + 1024 - 1)/ 1024;
701
703
704 if ( *n > 0) {
705 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
706 ((real *) a, (real *) b, (real *) c, (real *) bufred_d, *n);
708 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
710 }
711 else {
712 cuda_rzero(bufred_d, &red_s, stream);
713 }
715
716 return bufred[0];
717 }
718
723 void cuda_glsc3_many(real *h, void * w, void *v,void *mult, int *j, int *n,
724 cudaStream_t stream){
725 int pow2 = 1;
726 while(pow2 < (*j)){
727 pow2 = 2*pow2;
728 }
729 const int nt = 1024/pow2;
730 const dim3 nthrds(pow2, nt, 1);
731 const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
732 const int nb = ((*n) + nt - 1)/nt;
733
735
736 if ( *n > 0) {
737 glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
738 ((const real *) w, (const real **) v,
739 (const real *)mult, (real *)bufred_d, *j, *n);
742 <<<(*j), 1024, 0, stream>>>((real *) bufred_d, nb, *j);
744 }
745 else {
746 cuda_rzero(bufred_d, &red_s, stream);
747 }
748 cuda_global_reduce_add(h, bufred_d, (*j), stream);
749 }
750
755 real cuda_glsc2(void *a, void *b, int *n, cudaStream_t stream) {
756
757 const dim3 nthrds(1024, 1, 1);
758 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
759 const int nb = ((*n) + 1024 - 1)/ 1024;
760
762
763 if ( *n > 0) {
765 <<<nblcks, nthrds, 0, stream>>>((real *) a,
766 (real *) b,
767 (real *) bufred_d, *n);
769 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
771 }
772 else {
773 cuda_rzero(bufred_d, &red_s, stream);
774 }
776
777 return bufred[0];
778 }
779
784 real cuda_glsubnorm2(void *a, void *b, int *n, cudaStream_t stream) {
785
786 const dim3 nthrds(1024, 1, 1);
787 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
788 const int nb = ((*n) + 1024 - 1)/ 1024;
789
791
792 if ( *n > 0) {
794 <<<nblcks, nthrds, 0, stream>>>((real *) a,
795 (real *) b,
796 (real *) bufred_d, *n);
798 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
800 }
801 else {
802 cuda_rzero(bufred_d, &red_s, stream);
803 }
805
806 return bufred[0];
807 }
808
813 real cuda_glsum(void *a, int *n, cudaStream_t stream) {
814 const dim3 nthrds(1024, 1, 1);
815 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
816 const int nb = ((*n) + 1024 - 1)/ 1024;
817
819 if ( *n > 0) {
821 <<<nblcks, nthrds, 0, stream>>>((real *) a,
822 (real *) bufred_d, *n);
824 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
826 }
827 else {
828 cuda_rzero(bufred_d, &red_s, stream);
829 }
830
832
833 return bufred[0];
834 }
835
840 void cuda_absval(void *a, int *n, cudaStream_t stream) {
841
842 const dim3 nthrds(1024, 1, 1);
843 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
844
846 <<<nblcks, nthrds,0, stream>>>((real *) a, * n);
848
849 }
850
851 // ======================================================================== //
852 // Point-wise operations.
853
858 void cuda_pwmax_vec2(void *a, void *b, int *n, cudaStream_t stream) {
859
860 const dim3 nthrds(1024, 1, 1);
861 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
862
863 pwmax_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>
864 ((real *)a, (real *)b, *n);
866 }
867
872 void cuda_pwmax_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream) {
873
874 const dim3 nthrds(1024, 1, 1);
875 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
876
877 pwmax_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>
878 ((real *)a, (real *)b, (real *)c, *n);
880 }
881
886 void cuda_pwmax_sca2(void *a, real *c, int *n, cudaStream_t stream) {
887
888 const dim3 nthrds(1024, 1, 1);
889 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
890
891 pwmax_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>
892 ((real *)a, *c, *n);
894 }
895
900 void cuda_pwmax_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream) {
901
902 const dim3 nthrds(1024, 1, 1);
903 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
904
905 pwmax_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>
906 ((real *)a, (real *)b, *c, *n);
908 }
909
914 void cuda_pwmin_vec2(void *a, void *b, int *n, cudaStream_t stream) {
915
916 const dim3 nthrds(1024, 1, 1);
917 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
918
919 pwmin_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>
920 ((real *)a, (real *)b, *n);
922 }
923
928 void cuda_pwmin_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream) {
929
930 const dim3 nthrds(1024, 1, 1);
931 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
932
933 pwmin_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>
934 ((real *)a, (real *)b, (real *)c, *n);
936 }
937
942 void cuda_pwmin_sca2(void *a, real *c, int *n, cudaStream_t stream) {
943
944 const dim3 nthrds(1024, 1, 1);
945 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
946
947 pwmin_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *)a, *c, *n);
949 }
950
955 void cuda_pwmin_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream) {
956
957 const dim3 nthrds(1024, 1, 1);
958 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
959
960 pwmin_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>
961 ((real *)a, (real *)b, *c, *n);
963 }
964
965 // ======================================================================== //
966
970 void cuda_iadd(void *a, int *c, int *n, cudaStream_t stream) {
971
972 const dim3 nthrds(1024, 1, 1);
973 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
974
975 cadd_kernel<int><<<nblcks, nthrds, 0, stream>>>
976 ((int *) a, *c, *n);
978
979 }
980
981} /* extern "C" */
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
#define CUDA_CHECK(err)
Definition check.h:6
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
#define DEVICE_MPI_SUM
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
#define DEVICE_NCCL_SUM
void device_nccl_allreduce(void *sbuf_d, void *rbuf_d, int count, int nbytes, int op, void *stream)
void cuda_pwmax_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
Definition math.cu:900
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
Definition math.cu:532
void cuda_add2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:306
real cuda_glsubnorm2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:784
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cudaStream_t strm)
Definition math.cu:562
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cudaStream_t strm)
Definition math.cu:356
void cuda_global_reduce_add(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
Definition math.cu:628
real cuda_glsc2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:755
void cuda_addcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:518
void cuda_pwmin_vec2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:914
void cuda_pwmax_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:872
void cuda_cmult2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:160
void cuda_add2s1(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:290
void cuda_col3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:461
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n, cudaStream_t strm)
Definition math.cu:323
void * bufred_d
Definition math.cu:600
void cuda_sub3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:504
void cuda_copy(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:59
void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size, cudaStream_t strm)
Definition math.cu:126
void cuda_invcol2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:420
void cuda_pwmax_sca2(void *a, real *c, int *n, cudaStream_t stream)
Definition math.cu:886
void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:67
void cuda_col2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:448
void cuda_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:111
void cuda_add2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:245
void cuda_redbuf_check_alloc(int nb)
Definition math.cu:605
void cuda_cmult(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:147
void cuda_add4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
Definition math.cu:274
void cuda_sub2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:490
real * bufred
Definition math.cu:599
void cuda_invcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:434
void cuda_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cudaStream_t strm)
Definition math.cu:579
void cuda_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:82
real cuda_glsum(void *a, int *n, cudaStream_t stream)
Definition math.cu:813
real cuda_glsc3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:696
real cuda_vlsc3(void *u, void *v, void *w, int *n, cudaStream_t stream)
Definition math.cu:668
void cuda_pwmin_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
Definition math.cu:955
void cuda_cadd2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:215
int red_s
Definition math.cu:598
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:340
void cuda_add3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:260
void cuda_addcol3s2(void *a, void *b, void *c, real *s, int *n, cudaStream_t strm)
Definition math.cu:547
void cuda_pwmin_sca2(void *a, real *c, int *n, cudaStream_t stream)
Definition math.cu:942
void cuda_cfill(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:229
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cudaStream_t stream)
Definition math.cu:723
void cuda_absval(void *a, int *n, cudaStream_t stream)
Definition math.cu:840
void cuda_rzero(void *a, int *n, cudaStream_t strm)
Definition math.cu:140
void cuda_cdiv2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:187
void cuda_cdiv(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:174
void cuda_iadd(void *a, int *c, int *n, cudaStream_t stream)
Definition math.cu:970
void cuda_pwmax_vec2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:858
void cuda_masked_atomic_reduction(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:97
void cuda_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, cudaStream_t strm)
Definition math.cu:390
void cuda_invcol1(void *a, int *n, cudaStream_t strm)
Definition math.cu:407
void cuda_radd(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:201
void cuda_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, cudaStream_t strm)
Definition math.cu:373
void cuda_subcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:475
void cuda_pwmin_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:928
Object for handling masks in Neko.
Definition mask.f90:34