Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.cu
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#include "math_kernel.h"
37#include <device/cuda/check.h>
38#include <stdio.h>
39#include <stdlib.h>
40
41#ifdef HAVE_NVSHMEM
42#include <nvshmem.h>
43#include <nvshmemx.h>
44#endif
45
46extern "C" {
47
50
51#ifdef HAVE_NCCL
54#endif
55
59 void cuda_copy(void *a, void *b, int *n, cudaStream_t strm) {
60 CUDA_CHECK(cudaMemcpyAsync(a, b, (*n) * sizeof(real),
62 }
63
67 void cuda_masked_copy(void *a, void *b, void *mask,
68 int *n, int *m, cudaStream_t strm) {
69
70 const dim3 nthrds(1024, 1, 1);
71 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
72
74 ((real *) a, (real*) b,(int*) mask, *n, *m);
76
77 }
78
82 void cuda_masked_gather_copy(void *a, void *b, void *mask,
83 int *n, int *m, cudaStream_t strm) {
84
85 const dim3 nthrds(1024, 1, 1);
86 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
87
89 ((real *) a, (real*) b,(int*) mask, *n, *m);
91
92 }
93
97 void cuda_masked_gather_copy_aligned(void *a, void *b, void *mask,
98 int *n, int *m, cudaStream_t strm) {
99
100 const dim3 nthrds(1024, 1, 1);
101 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
102
104 ((real *) a, (real*) b,(int*) mask, *n, *m);
106
107 }
108
112 void cuda_face_masked_gather_copy(void *a, void *b, void *mask,
113 void *facet, int *n1, int *n2, int *lx,
114 int *ly, int *lz, int *m,
116
117 const dim3 nthrds(1024, 1, 1);
118 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
119
121 ((real *) a, (real *) b, (int *) mask, (int *) facet, *n1, *n2, *lx,
122 *ly, *lz, *m);
124
125 }
126
130 void cuda_masked_atomic_reduction(void *a, void *b, void *mask,
131 int *n, int *m, cudaStream_t strm) {
132
133 const dim3 nthrds(1024, 1, 1);
134 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
135
137 ((real *) a, (real *) b, (int *) mask, *n, *m);
139
140 }
144 void cuda_masked_scatter_copy(void *a, void *b, void *mask,
145 int *n, int *m, cudaStream_t strm) {
146
147 const dim3 nthrds(1024, 1, 1);
148 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
149
151 ((real *) a, (real*) b,(int*) mask, *n, *m);
153 }
154
158 void cuda_masked_scatter_copy_aligned(void *a, void *b, void *mask,
159 int *n, int *m, cudaStream_t strm) {
160
161 const dim3 nthrds(1024, 1, 1);
162 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
163
165 ((real *) a, (real*) b,(int*) mask, *n, *m);
167 }
168
169
173 void cuda_cfill_mask(void* a, real* c, int* size, int* mask, int* mask_size,
175
176 const dim3 nthrds(1024, 1, 1);
177 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
178
180 ((real*)a, *c, *size, mask, *mask_size);
182 }
183
/** Zero out a real-valued device array.
 *
 * @param a     device pointer to the array to clear
 * @param b     (none)
 * @param n     host pointer to the number of entries to zero
 * @param strm  CUDA stream the memset is enqueued on
 */
void cuda_rzero(void *a, int *n, cudaStream_t strm) {
  const size_t nbytes = (*n) * sizeof(real);
  CUDA_CHECK(cudaMemsetAsync(a, 0, nbytes, strm));
}
190
/** Multiply a device array by a constant: a = c * a (cmult_kernel).
 *
 * @param a     device pointer to a real array of length *n
 * @param c     host pointer to the scalar multiplier
 * @param n     host pointer to the array length
 * @param strm  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 (a zero-block grid is an invalid launch
 * configuration; cuda_cfill in this file already guards the same way)
 * and check the launch with cudaGetLastError().
 */
void cuda_cmult(void *a, real *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    cmult_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

}
203
207 void cuda_cmult2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
208
209 const dim3 nthrds(1024, 1, 1);
210 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
211
213 ((real *) a, (real *) b, *c, *n);
215
216 }
217
/** Divide a device array by a constant: a = a / c (cdiv_kernel).
 *
 * @param a     device pointer to a real array of length *n
 * @param c     host pointer to the scalar divisor
 * @param n     host pointer to the array length
 * @param strm  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 (zero-block grid is an invalid launch
 * configuration) and check the launch with cudaGetLastError().
 */
void cuda_cdiv(void *a, real *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    cdiv_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

}
230
234 void cuda_cdiv2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
235
236 const dim3 nthrds(1024, 1, 1);
237 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
238
240 ((real *) a, (real *) b, *c, *n);
242
243 }
244
/** Add a constant to a device array: a = a + c (via cadd_kernel<real>).
 *
 * @param a     device pointer to a real array of length *n
 * @param c     host pointer to the scalar addend
 * @param n     host pointer to the array length
 * @param strm  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 (zero-block grid is an invalid launch
 * configuration) and check the launch with cudaGetLastError().
 */
void cuda_radd(void *a, real *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    cadd_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

}
257
262 void cuda_cadd2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
263
264 const dim3 nthrds(1024, 1, 1);
265 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
266
268 ((real *) a, (real *) b, *c, *n);
270
271 }
272
277 void cuda_cwrap(void *a, real *min_val, real *max_val, int *n,
279
280 const dim3 nthrds(1024, 1, 1);
281 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
282
284 ((real *) a, *min_val, *max_val, *n);
286
287 }
288
/** Fill a device array with a constant: a = c (cfill_kernel).
 *
 * @param a     device pointer to a real array of length *n
 * @param c     host pointer to the fill value
 * @param n     host pointer to the array length (no-op when *n <= 0)
 * @param strm  CUDA stream the kernel is enqueued on
 *
 * Fix: check the launch with cudaGetLastError(); the zero-size guard
 * was already present.
 */
void cuda_cfill(void *a, real *c, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    cfill_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

}
303
308 void cuda_add2(void *a, void *b, int *n, cudaStream_t strm) {
309
310 const dim3 nthrds(1024, 1, 1);
311 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
312
314 ((real *) a, (real *) b, *n);
316
317 }
318
323 void cuda_add3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
324
325 const dim3 nthrds(1024, 1, 1);
326 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
327
329 ((real *) a, (real *) b, (real *) c, *n);
331 }
332
337 void cuda_add4(void *a, void *b, void *c, void *d, int *n,
339
340 const dim3 nthrds(1024, 1, 1);
341 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
342
344 ((real *) a, (real *) b, (real *) c, (real *) d, *n);
346
347 }
353 void cuda_add2s1(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
354
355 const dim3 nthrds(1024, 1, 1);
356 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
357
359 ((real *) a, (real *) b, *c1, *n);
361
362 }
363
369 void cuda_add2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
370
371 const dim3 nthrds(1024, 1, 1);
372 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
373
375 ((real *) a, (real *) b, *c1, *n);
377
378 }
379
386 void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n,
388
389 const dim3 nthrds(1024, 1, 1);
390 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
391
393 ((real *) x, (const real **) p, (real *) alpha, *j, *n);
395
396 }
397
403 void cuda_addsqr2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
404
405 const dim3 nthrds(1024, 1, 1);
406 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
407
409 ((real *) a, (real *) b, *c1, *n);
411
412 }
413
419 void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n,
421
422 const dim3 nthrds(1024, 1, 1);
423 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
424
426 ((real *) a, (real *) b, (real *) c, *c1, *c2, *n);
428
429 }
430
436 void cuda_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2,
437 real *c3, int *n, cudaStream_t strm) {
438
439 const dim3 nthrds(1024, 1, 1);
440 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
441
443 ((real *) a, (real *) b, (real *) c, (real *) d, *c1, *c2, *c3, *n);
445
446 }
447
453 void cuda_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1,
454 real *c2, real *c3, real *c4, int *n, cudaStream_t strm) {
455
456 const dim3 nthrds(1024, 1, 1);
457 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
458
460 ((real *) a, (real *) b, (real *) c, (real *) d, (real *) e,
461 *c1, *c2, *c3, *c4, *n);
463
464 }
465
/** Invert a device array in place: a = 1 / a (invcol1_kernel).
 *
 * @param a     device pointer to a real array of length *n
 * @param n     host pointer to the array length
 * @param strm  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 (zero-block grid is an invalid launch
 * configuration) and check the launch with cudaGetLastError().
 */
void cuda_invcol1(void *a, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    invcol1_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *n);
    CUDA_CHECK(cudaGetLastError());
  }
}
478
483 void cuda_invcol2(void *a, void *b, int *n, cudaStream_t strm) {
484
485 const dim3 nthrds(1024, 1, 1);
486 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
487
489 ((real *) a, (real *) b, *n);
491 }
492
497 void cuda_invcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
498
499 const dim3 nthrds(1024, 1, 1);
500 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
501
503 ((real *) a, (real *) b, (real *) c, *n);
505 }
506
/** Pointwise multiply two device arrays: a = a * b (col2_kernel).
 *
 * @param a     device pointer to a real array of length *n (in/out)
 * @param b     device pointer to a real array of length *n (read-only)
 * @param n     host pointer to the array length
 * @param strm  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 (zero-block grid is an invalid launch
 * configuration) and check the launch with cudaGetLastError().
 */
void cuda_col2(void *a, void *b, int *n, cudaStream_t strm) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    col2_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }
}
519
524 void cuda_col3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
525
526 const dim3 nthrds(1024, 1, 1);
527 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
528
530 ((real *) a, (real *) b, (real *) c, *n);
532 }
533
538 void cuda_subcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
539
540 const dim3 nthrds(1024, 1, 1);
541 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
542
544 ((real *) a, (real *) b, (real *) c, *n);
546 }
547
548
553 void cuda_sub2(void *a, void *b, int *n, cudaStream_t strm) {
554
555 const dim3 nthrds(1024, 1, 1);
556 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
557
559 ((real *) a, (real *) b, *n);
561 }
562
567 void cuda_sub3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
568
569 const dim3 nthrds(1024, 1, 1);
570 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
571
573 ((real *) a, (real *) b, (real *) c, *n);
575 }
576
581 void cuda_addcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
582
583 const dim3 nthrds(1024, 1, 1);
584 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
585
587 ((real *) a, (real *) b, (real *) c, *n);
589 }
590
595 void cuda_addcol4(void *a, void *b, void *c, void *d, int *n,
597
598 const dim3 nthrds(1024, 1, 1);
599 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
600
602 ((real *) a, (real *) b, (real *) c, (real *) d, *n);
604 }
605
610 void cuda_addcol3s2(void *a, void *b, void *c, real *s, int *n,
612
613 const dim3 nthrds(1024, 1, 1);
614 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
615
617 ((real *) a, (real *) b, (real *) c, *s, *n);
619 }
620
625 void cuda_vdot3(void *dot, void *u1, void *u2, void *u3,
626 void *v1, void *v2, void *v3, int *n,
628
629 const dim3 nthrds(1024, 1, 1);
630 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
631
633 ((real *) dot, (real *) u1, (real *) u2, (real *) u3,
634 (real *) v1, (real *) v2, (real *) v3, *n);
636 }
637
642 void cuda_vcross(void *u1, void *u2, void *u3,
643 void *v1, void *v2, void *v3,
644 void *w1, void *w2, void *w3,
645 int *n, cudaStream_t strm) {
646
647 const dim3 nthrds(1024, 1, 1);
648 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
649
651 ((real *) u1, (real *) u2, (real *) u3,
652 (real *) v1, (real *) v2, (real *) v3,
653 (real *) w1, (real *) w2, (real *) w3, *n);
655 }
656
657
658 /*
659 * Reduction buffer
660 */
661 int red_s = 0;
663 void * bufred_d = NULL;
664
669 if ( nb >= red_s) {
670 red_s = nb+1;
671 if (bufred != NULL) {
673#ifdef HAVE_NVSHMEM
675#else
677#endif
678 }
680#ifdef HAVE_NVSHMEM
681 bufred_d = (real *) nvshmem_malloc(red_s*sizeof(real));
682#else
684#endif
685 }
686 }
687
692 int n, const cudaStream_t stream) {
693
694#ifdef HAVE_NCCL
696 DEVICE_NCCL_SUM, stream);
698 cudaMemcpyDeviceToHost, stream));
699 cudaStreamSynchronize(stream);
700#elif HAVE_NVSHMEM
701 if (sizeof(real) == sizeof(float)) {
703 (float *) bufred_d,
704 (float *) bufred_d, n, stream);
705 }
706 else if (sizeof(real) == sizeof(double)) {
708 (double *) bufred_d,
709 (double *) bufred_d, n, stream);
710
711 }
713 sizeof(real)*n, cudaMemcpyDeviceToHost, stream));
714 cudaStreamSynchronize(stream);
715#elif HAVE_DEVICE_MPI
716 cudaStreamSynchronize(stream);
718#else
720 cudaMemcpyDeviceToHost, stream));
721 cudaStreamSynchronize(stream);
722#endif
723 }
724
729 int n, const cudaStream_t stream) {
730
731#ifdef HAVE_NCCL
733 DEVICE_NCCL_MAX, stream);
735 cudaMemcpyDeviceToHost, stream));
736 cudaStreamSynchronize(stream);
737#elif HAVE_NVSHMEM
738 if (sizeof(real) == sizeof(float)) {
740 (float *) bufred_d,
741 (float *) bufred_d, n, stream);
742 }
743 else if (sizeof(real) == sizeof(double)) {
745 (double *) bufred_d,
746 (double *) bufred_d, n, stream);
747
748 }
750 sizeof(real)*n, cudaMemcpyDeviceToHost, stream));
751 cudaStreamSynchronize(stream);
752#elif HAVE_DEVICE_MPI
753 cudaStreamSynchronize(stream);
755#else
757 cudaMemcpyDeviceToHost, stream));
758 cudaStreamSynchronize(stream);
759#endif
760 }
761
766 int n, const cudaStream_t stream) {
767
768#ifdef HAVE_NCCL
770 DEVICE_NCCL_MIN, stream);
772 cudaMemcpyDeviceToHost, stream));
773 cudaStreamSynchronize(stream);
774#elif HAVE_NVSHMEM
775 if (sizeof(real) == sizeof(float)) {
777 (float *) bufred_d,
778 (float *) bufred_d, n, stream);
779 }
780 else if (sizeof(real) == sizeof(double)) {
782 (double *) bufred_d,
783 (double *) bufred_d, n, stream);
784
785 }
787 sizeof(real)*n, cudaMemcpyDeviceToHost, stream));
788 cudaStreamSynchronize(stream);
789#elif HAVE_DEVICE_MPI
790 cudaStreamSynchronize(stream);
792#else
794 cudaMemcpyDeviceToHost, stream));
795 cudaStreamSynchronize(stream);
796#endif
797 }
798
799
804 real cuda_vlsc3(void *u, void *v, void *w, int *n, cudaStream_t stream) {
805
806 const dim3 nthrds(1024, 1, 1);
807 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
808 const int nb = ((*n) + 1024 - 1)/ 1024;
809
811
812 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
813 ((real *) u, (real *) v, (real *) w, (real *) bufred_d, *n);
815 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
817
819 cudaMemcpyDeviceToHost, stream));
820 cudaStreamSynchronize(stream);
821
822 return bufred[0];
823 }
824
825
826
827
832 real cuda_glsc3(void *a, void *b, void *c, int *n, cudaStream_t stream) {
833
834 const dim3 nthrds(1024, 1, 1);
835 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
836 const int nb = ((*n) + 1024 - 1)/ 1024;
837
839
840 if ( *n > 0) {
841 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
842 ((real *) a, (real *) b, (real *) c, (real *) bufred_d, *n);
844 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
846 }
847 else {
848 cuda_rzero(bufred_d, &red_s, stream);
849 }
851
852 return bufred[0];
853 }
854
859 void cuda_glsc3_many(real *h, void * w, void *v,void *mult, int *j, int *n,
860 cudaStream_t stream){
861 int pow2 = 1;
862 while(pow2 < (*j)){
863 pow2 = 2*pow2;
864 }
865 const int nt = 1024/pow2;
866 const dim3 nthrds(pow2, nt, 1);
867 const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
868 const int nb = ((*n) + nt - 1)/nt;
869
871
872 if ( *n > 0) {
873 glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
874 ((const real *) w, (const real **) v,
875 (const real *)mult, (real *)bufred_d, *j, *n);
878 <<<(*j), 1024, 0, stream>>>((real *) bufred_d, nb, *j);
880 }
881 else {
882 cuda_rzero(bufred_d, &red_s, stream);
883 }
884 cuda_global_reduce_add(h, bufred_d, (*j), stream);
885 }
886
891 real cuda_glsc2(void *a, void *b, int *n, cudaStream_t stream) {
892
893 const dim3 nthrds(1024, 1, 1);
894 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
895 const int nb = ((*n) + 1024 - 1)/ 1024;
896
898
899 if ( *n > 0) {
901 <<<nblcks, nthrds, 0, stream>>>((real *) a,
902 (real *) b,
903 (real *) bufred_d, *n);
905 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
907 }
908 else {
909 cuda_rzero(bufred_d, &red_s, stream);
910 }
912
913 return bufred[0];
914 }
915
920 real cuda_glsubnorm2(void *a, void *b, int *n, cudaStream_t stream) {
921
922 const dim3 nthrds(1024, 1, 1);
923 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
924 const int nb = ((*n) + 1024 - 1)/ 1024;
925
927
928 if ( *n > 0) {
930 <<<nblcks, nthrds, 0, stream>>>((real *) a,
931 (real *) b,
932 (real *) bufred_d, *n);
934 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
936 }
937 else {
938 cuda_rzero(bufred_d, &red_s, stream);
939 }
941
942 return bufred[0];
943 }
944
949 real cuda_glsum(void *a, int *n, cudaStream_t stream) {
950 const dim3 nthrds(1024, 1, 1);
951 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
952 const int nb = ((*n) + 1024 - 1)/ 1024;
953
955 if ( *n > 0) {
957 <<<nblcks, nthrds, 0, stream>>>((real *) a,
958 (real *) bufred_d, *n);
960 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
962 }
963 else {
964 cuda_rzero(bufred_d, &red_s, stream);
965 }
966
968
969 return bufred[0];
970 }
971
976 real cuda_glmax(void *a, real *ninf, int *n, cudaStream_t stream) {
977 const dim3 nthrds(1024, 1, 1);
978 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
979 const int nb = ((*n) + 1024 - 1)/ 1024;
980
982 if ( *n > 0) {
984 <<<nblcks, nthrds, 0, stream>>>((real *) a, *ninf,
985 (real *) bufred_d, *n);
987 reduce_max_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d,
988 *ninf, nb);
990 }
991 else {
992 cuda_rzero(bufred_d, &red_s, stream);
993 }
994
996
997 return bufred[0];
998 }
999
1004 real cuda_glmin(void *a, real *pinf, int *n, cudaStream_t stream) {
1005 const dim3 nthrds(1024, 1, 1);
1006 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
1007 const int nb = ((*n) + 1024 - 1)/ 1024;
1008
1010 if ( *n > 0) {
1012 <<<nblcks, nthrds, 0, stream>>>((real *) a, *pinf,
1013 (real *) bufred_d, *n);
1015 reduce_min_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d,
1016 *pinf, nb);
1018 }
1019 else {
1020 cuda_rzero(bufred_d, &red_s, stream);
1021 }
1022
1024
1025 return bufred[0];
1026 }
1027
1032 void cuda_absval(void *a, int *n, cudaStream_t stream) {
1033
1034 const dim3 nthrds(1024, 1, 1);
1035 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
1036
1038 <<<nblcks, nthrds,0, stream>>>((real *) a, * n);
1040
1041 }
1042
1043 // ======================================================================== //
1044 // Point-wise operations.
1045
/** Pointwise maximum of two device vectors (pwmax_vec2_kernel;
 * presumably a = max(a, b) elementwise — kernel defined in math_kernel.h).
 *
 * @param a       device pointer to a real array of length *n (in/out)
 * @param b       device pointer to a real array of length *n (read-only)
 * @param n       host pointer to the array length
 * @param stream  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 (zero-block grid is an invalid launch
 * configuration) and check the launch with cudaGetLastError().
 */
void cuda_pwmax_vec2(void *a, void *b, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    pwmax_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *)a, (real *)b, *n);
    CUDA_CHECK(cudaGetLastError());
  }
}
1059
/** Pointwise maximum of two device vectors into a third
 * (pwmax_vec3_kernel; presumably a = max(b, c) elementwise — kernel
 * defined in math_kernel.h).
 *
 * @param a       device pointer to the output array of length *n
 * @param b       device pointer to a real array of length *n (read-only)
 * @param c       device pointer to a real array of length *n (read-only)
 * @param n       host pointer to the array length
 * @param stream  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 and check the launch with cudaGetLastError().
 */
void cuda_pwmax_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    pwmax_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *)a, (real *)b, (real *)c, *n);
    CUDA_CHECK(cudaGetLastError());
  }
}
1073
/** Pointwise maximum of a device vector and a scalar (pwmax_sca2_kernel;
 * presumably a = max(a, c) elementwise — kernel defined in math_kernel.h).
 *
 * @param a       device pointer to a real array of length *n (in/out)
 * @param c       host pointer to the scalar bound
 * @param n       host pointer to the array length
 * @param stream  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 and check the launch with cudaGetLastError().
 */
void cuda_pwmax_sca2(void *a, real *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    pwmax_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *)a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }
}
1087
/** Pointwise maximum of a device vector and a scalar, into a second
 * vector (pwmax_sca3_kernel; presumably a = max(b, c) elementwise —
 * kernel defined in math_kernel.h).
 *
 * @param a       device pointer to the output array of length *n
 * @param b       device pointer to a real array of length *n (read-only)
 * @param c       host pointer to the scalar bound
 * @param n       host pointer to the array length
 * @param stream  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 and check the launch with cudaGetLastError().
 */
void cuda_pwmax_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    pwmax_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *)a, (real *)b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }
}
1101
/** Pointwise minimum of two device vectors (pwmin_vec2_kernel;
 * presumably a = min(a, b) elementwise — kernel defined in math_kernel.h).
 *
 * @param a       device pointer to a real array of length *n (in/out)
 * @param b       device pointer to a real array of length *n (read-only)
 * @param n       host pointer to the array length
 * @param stream  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 and check the launch with cudaGetLastError().
 */
void cuda_pwmin_vec2(void *a, void *b, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    pwmin_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *)a, (real *)b, *n);
    CUDA_CHECK(cudaGetLastError());
  }
}
1115
/** Pointwise minimum of two device vectors into a third
 * (pwmin_vec3_kernel; presumably a = min(b, c) elementwise — kernel
 * defined in math_kernel.h).
 *
 * @param a       device pointer to the output array of length *n
 * @param b       device pointer to a real array of length *n (read-only)
 * @param c       device pointer to a real array of length *n (read-only)
 * @param n       host pointer to the array length
 * @param stream  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 and check the launch with cudaGetLastError().
 */
void cuda_pwmin_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    pwmin_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *)a, (real *)b, (real *)c, *n);
    CUDA_CHECK(cudaGetLastError());
  }
}
1129
/** Pointwise minimum of a device vector and a scalar (pwmin_sca2_kernel;
 * presumably a = min(a, c) elementwise — kernel defined in math_kernel.h).
 *
 * @param a       device pointer to a real array of length *n (in/out)
 * @param c       host pointer to the scalar bound
 * @param n       host pointer to the array length
 * @param stream  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 and check the launch with cudaGetLastError().
 */
void cuda_pwmin_sca2(void *a, real *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    pwmin_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *)a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }
}
1142
/** Pointwise minimum of a device vector and a scalar, into a second
 * vector (pwmin_sca3_kernel; presumably a = min(b, c) elementwise —
 * kernel defined in math_kernel.h).
 *
 * @param a       device pointer to the output array of length *n
 * @param b       device pointer to a real array of length *n (read-only)
 * @param c       host pointer to the scalar bound
 * @param n       host pointer to the array length
 * @param stream  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 and check the launch with cudaGetLastError().
 */
void cuda_pwmin_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    pwmin_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *)a, (real *)b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }
}
1156
1157 // ======================================================================== //
1158
/** Add a constant to an integer device array: a = a + c
 * (via cadd_kernel<int>).
 *
 * @param a       device pointer to an int array of length *n (in/out)
 * @param c       host pointer to the scalar addend
 * @param n       host pointer to the array length
 * @param stream  CUDA stream the kernel is enqueued on
 *
 * Fix: guard against *n == 0 (zero-block grid is an invalid launch
 * configuration) and check the launch with cudaGetLastError().
 */
void cuda_iadd(void *a, int *c, int *n, cudaStream_t stream) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    cadd_kernel<int><<<nblcks, nthrds, 0, stream>>>
      ((int *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

}
1172
1173} /* extern "C" */
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
#define CUDA_CHECK(err)
Definition check.h:6
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
#define DEVICE_MPI_MAX
#define DEVICE_MPI_MIN
#define DEVICE_MPI_SUM
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
#define DEVICE_NCCL_MAX
#define DEVICE_NCCL_MIN
#define DEVICE_NCCL_SUM
void device_nccl_allreduce(void *sbuf_d, void *rbuf_d, int count, int nbytes, int op, void *stream)
void cuda_global_reduce_min(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
Definition math.cu:765
void cuda_pwmax_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
Definition math.cu:1092
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
Definition math.cu:595
void cuda_add2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:369
real cuda_glsubnorm2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:920
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cudaStream_t strm)
Definition math.cu:625
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cudaStream_t strm)
Definition math.cu:419
void cuda_face_masked_gather_copy(void *a, void *b, void *mask, void *facet, int *n1, int *n2, int *lx, int *ly, int *lz, int *m, cudaStream_t strm)
Definition math.cu:112
void cuda_global_reduce_add(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
Definition math.cu:691
real cuda_glsc2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:891
void cuda_addcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:581
void cuda_pwmin_vec2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:1106
void cuda_pwmax_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:1064
real cuda_glmin(void *a, real *pinf, int *n, cudaStream_t stream)
Definition math.cu:1004
void cuda_cmult2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:207
void cuda_add2s1(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:353
void cuda_col3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:524
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n, cudaStream_t strm)
Definition math.cu:386
void * bufred_d
Definition math.cu:663
void cuda_sub3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:567
void cuda_copy(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:59
void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size, cudaStream_t strm)
Definition math.cu:173
void cuda_invcol2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:483
void cuda_pwmax_sca2(void *a, real *c, int *n, cudaStream_t stream)
Definition math.cu:1078
void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:67
void cuda_col2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:511
void cuda_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:144
void cuda_add2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:308
void cuda_redbuf_check_alloc(int nb)
Definition math.cu:668
void cuda_cmult(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:194
void cuda_add4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
Definition math.cu:337
void cuda_masked_scatter_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:158
void cuda_sub2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:553
real * bufred
Definition math.cu:662
void cuda_invcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:497
void cuda_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cudaStream_t strm)
Definition math.cu:642
void cuda_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:82
real cuda_glsum(void *a, int *n, cudaStream_t stream)
Definition math.cu:949
real cuda_glsc3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:832
real cuda_vlsc3(void *u, void *v, void *w, int *n, cudaStream_t stream)
Definition math.cu:804
void cuda_pwmin_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
Definition math.cu:1147
void cuda_cadd2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:262
int red_s
Definition math.cu:661
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:403
void cuda_add3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:323
void cuda_addcol3s2(void *a, void *b, void *c, real *s, int *n, cudaStream_t strm)
Definition math.cu:610
void cuda_cwrap(void *a, real *min_val, real *max_val, int *n, cudaStream_t strm)
Definition math.cu:277
void cuda_pwmin_sca2(void *a, real *c, int *n, cudaStream_t stream)
Definition math.cu:1134
void cuda_cfill(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:292
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cudaStream_t stream)
Definition math.cu:859
void cuda_absval(void *a, int *n, cudaStream_t stream)
Definition math.cu:1032
void cuda_rzero(void *a, int *n, cudaStream_t strm)
Definition math.cu:187
void cuda_cdiv2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:234
void cuda_cdiv(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:221
void cuda_global_reduce_max(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
Definition math.cu:728
void cuda_iadd(void *a, int *c, int *n, cudaStream_t stream)
Definition math.cu:1162
void cuda_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:97
void cuda_pwmax_vec2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:1050
void cuda_masked_atomic_reduction(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:130
void cuda_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, cudaStream_t strm)
Definition math.cu:453
void cuda_invcol1(void *a, int *n, cudaStream_t strm)
Definition math.cu:470
void cuda_radd(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:248
void cuda_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, cudaStream_t strm)
Definition math.cu:436
real cuda_glmax(void *a, real *ninf, int *n, cudaStream_t stream)
Definition math.cu:976
void cuda_subcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:538
void cuda_pwmin_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:1120
Object for handling masks in Neko.
Definition mask.f90:34