Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.cu
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#include "math_kernel.h"
37#include <device/cuda/check.h>
38#include <stdio.h>
39#include <stdlib.h>
40
41#ifdef HAVE_NVSHMEM
42#include <nvshmem.h>
43#include <nvshmemx.h>
44#endif
45
46extern "C" {
47
50
51#ifdef HAVE_NCCL
54#endif
55
59 void cuda_copy(void *a, void *b, int *n, cudaStream_t strm) {
60 CUDA_CHECK(cudaMemcpyAsync(a, b, (*n) * sizeof(real),
62 }
63
68 void cuda_masked_copy_0(void *a, void *b, void *mask,
69 int *n, int *m, cudaStream_t strm) {
70
71 const dim3 nthrds(1024, 1, 1);
72 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
73
75 ((real *) a, (real*) b,(int*) mask, *n, *m);
77
78 }
79
84 void cuda_masked_copy_aligned(void *a, void *b, void *mask,
85 int *n, int *m, cudaStream_t strm) {
86
87 const dim3 nthrds(1024, 1, 1);
88 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
89
91 ((real *) a, (real*) b,(int*) mask, *n, *m);
93
94 }
95
99 void cuda_masked_gather_copy(void *a, void *b, void *mask,
100 int *n, int *m, cudaStream_t strm) {
101
102 const dim3 nthrds(1024, 1, 1);
103 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
104
106 ((real *) a, (real*) b,(int*) mask, *n, *m);
108
109 }
110
114 void cuda_masked_gather_copy_aligned(void *a, void *b, void *mask,
115 int *n, int *m, cudaStream_t strm) {
116
117 const dim3 nthrds(1024, 1, 1);
118 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
119
121 ((real *) a, (real*) b,(int*) mask, *n, *m);
123
124 }
125
129 void cuda_face_masked_gather_copy(void *a, void *b, void *mask,
130 void *facet, int *n1, int *n2, int *lx,
131 int *ly, int *lz, int *m,
133
134 const dim3 nthrds(1024, 1, 1);
135 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
136
138 ((real *) a, (real *) b, (int *) mask, (int *) facet, *n1, *n2, *lx,
139 *ly, *lz, *m);
141
142 }
143
147 void cuda_masked_atomic_reduction(void *a, void *b, void *mask,
148 int *n, int *m, cudaStream_t strm) {
149
150 const dim3 nthrds(1024, 1, 1);
151 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
152
154 ((real *) a, (real *) b, (int *) mask, *n, *m);
156
157 }
161 void cuda_masked_scatter_copy(void *a, void *b, void *mask,
162 int *n, int *m, cudaStream_t strm) {
163
164 const dim3 nthrds(1024, 1, 1);
165 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
166
168 ((real *) a, (real*) b,(int*) mask, *n, *m);
170 }
171
175 void cuda_masked_scatter_copy_aligned(void *a, void *b, void *mask,
176 int *n, int *m, cudaStream_t strm) {
177
178 const dim3 nthrds(1024, 1, 1);
179 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
180
182 ((real *) a, (real*) b,(int*) mask, *n, *m);
184 }
185
186
190 void cuda_cfill_mask(void* a, real* c, int* size, int* mask, int* mask_size,
192
193 const dim3 nthrds(1024, 1, 1);
194 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
195
197 ((real*)a, *c, *size, mask, *mask_size);
199 }
200
204 void cuda_rzero(void *a, int *n, cudaStream_t strm) {
205 CUDA_CHECK(cudaMemsetAsync(a, 0, (*n) * sizeof(real), strm));
206 }
207
211 void cuda_cmult(void *a, real *c, int *n, cudaStream_t strm) {
212
213 const dim3 nthrds(1024, 1, 1);
214 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
215
216 cmult_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
218
219 }
220
224 void cuda_cmult2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
225
226 const dim3 nthrds(1024, 1, 1);
227 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
228
230 ((real *) a, (real *) b, *c, *n);
232
233 }
234
238 void cuda_cdiv(void *a, real *c, int *n, cudaStream_t strm) {
239
240 const dim3 nthrds(1024, 1, 1);
241 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
242
243 cdiv_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
245
246 }
247
251 void cuda_cdiv2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
252
253 const dim3 nthrds(1024, 1, 1);
254 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
255
257 ((real *) a, (real *) b, *c, *n);
259
260 }
261
265 void cuda_radd(void *a, real *c, int *n, cudaStream_t strm) {
266
267 const dim3 nthrds(1024, 1, 1);
268 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
269
270 cadd_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
272
273 }
274
279 void cuda_cadd2(void *a, void *b, real *c, int *n, cudaStream_t strm) {
280
281 const dim3 nthrds(1024, 1, 1);
282 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
283
285 ((real *) a, (real *) b, *c, *n);
287
288 }
289
294 void cuda_cwrap(void *a, real *min_val, real *max_val, int *n,
296
297 const dim3 nthrds(1024, 1, 1);
298 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
299
301 ((real *) a, *min_val, *max_val, *n);
303
304 }
305
309 void cuda_cfill(void *a, real *c, int *n, cudaStream_t strm) {
310
311 const dim3 nthrds(1024, 1, 1);
312 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
313
314 if (*n > 0){
315 cfill_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *c, *n);
317 }
318
319 }
320
325 void cuda_add2(void *a, void *b, int *n, cudaStream_t strm) {
326
327 const dim3 nthrds(1024, 1, 1);
328 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
329
331 ((real *) a, (real *) b, *n);
333
334 }
335
340 void cuda_add3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
341
342 const dim3 nthrds(1024, 1, 1);
343 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
344
346 ((real *) a, (real *) b, (real *) c, *n);
348 }
349
354 void cuda_add4(void *a, void *b, void *c, void *d, int *n,
356
357 const dim3 nthrds(1024, 1, 1);
358 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
359
361 ((real *) a, (real *) b, (real *) c, (real *) d, *n);
363
364 }
370 void cuda_add2s1(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
371
372 const dim3 nthrds(1024, 1, 1);
373 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
374
376 ((real *) a, (real *) b, *c1, *n);
378
379 }
380
386 void cuda_add2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
387
388 const dim3 nthrds(1024, 1, 1);
389 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
390
392 ((real *) a, (real *) b, *c1, *n);
394
395 }
396
403 void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n,
405
406 const dim3 nthrds(1024, 1, 1);
407 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
408
410 ((real *) x, (const real **) p, (real *) alpha, *j, *n);
412
413 }
414
420 void cuda_addsqr2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm) {
421
422 const dim3 nthrds(1024, 1, 1);
423 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
424
426 ((real *) a, (real *) b, *c1, *n);
428
429 }
430
436 void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n,
438
439 const dim3 nthrds(1024, 1, 1);
440 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
441
443 ((real *) a, (real *) b, (real *) c, *c1, *c2, *n);
445
446 }
447
453 void cuda_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2,
454 real *c3, int *n, cudaStream_t strm) {
455
456 const dim3 nthrds(1024, 1, 1);
457 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
458
460 ((real *) a, (real *) b, (real *) c, (real *) d, *c1, *c2, *c3, *n);
462
463 }
464
470 void cuda_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1,
471 real *c2, real *c3, real *c4, int *n, cudaStream_t strm) {
472
473 const dim3 nthrds(1024, 1, 1);
474 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
475
477 ((real *) a, (real *) b, (real *) c, (real *) d, (real *) e,
478 *c1, *c2, *c3, *c4, *n);
480
481 }
482
487 void cuda_invcol1(void *a, int *n, cudaStream_t strm) {
488
489 const dim3 nthrds(1024, 1, 1);
490 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
491
492 invcol1_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, *n);
494 }
495
500 void cuda_invcol2(void *a, void *b, int *n, cudaStream_t strm) {
501
502 const dim3 nthrds(1024, 1, 1);
503 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
504
506 ((real *) a, (real *) b, *n);
508 }
509
514 void cuda_invcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
515
516 const dim3 nthrds(1024, 1, 1);
517 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
518
520 ((real *) a, (real *) b, (real *) c, *n);
522 }
523
528 void cuda_col2(void *a, void *b, int *n, cudaStream_t strm) {
529
530 const dim3 nthrds(1024, 1, 1);
531 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
532
533 col2_kernel<real><<<nblcks, nthrds, 0, strm>>>((real *) a, (real *) b, *n);
535 }
536
541 void cuda_col3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
542
543 const dim3 nthrds(1024, 1, 1);
544 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
545
547 ((real *) a, (real *) b, (real *) c, *n);
549 }
550
555 void cuda_subcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
556
557 const dim3 nthrds(1024, 1, 1);
558 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
559
561 ((real *) a, (real *) b, (real *) c, *n);
563 }
564
565
570 void cuda_sub2(void *a, void *b, int *n, cudaStream_t strm) {
571
572 const dim3 nthrds(1024, 1, 1);
573 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
574
576 ((real *) a, (real *) b, *n);
578 }
579
584 void cuda_sub3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
585
586 const dim3 nthrds(1024, 1, 1);
587 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
588
590 ((real *) a, (real *) b, (real *) c, *n);
592 }
593
598 void cuda_addcol3(void *a, void *b, void *c, int *n, cudaStream_t strm) {
599
600 const dim3 nthrds(1024, 1, 1);
601 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
602
604 ((real *) a, (real *) b, (real *) c, *n);
606 }
607
612 void cuda_addcol4(void *a, void *b, void *c, void *d, int *n,
614
615 const dim3 nthrds(1024, 1, 1);
616 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
617
619 ((real *) a, (real *) b, (real *) c, (real *) d, *n);
621 }
622
627 void cuda_addcol3s2(void *a, void *b, void *c, real *s, int *n,
629
630 const dim3 nthrds(1024, 1, 1);
631 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
632
634 ((real *) a, (real *) b, (real *) c, *s, *n);
636 }
637
642 void cuda_vdot3(void *dot, void *u1, void *u2, void *u3,
643 void *v1, void *v2, void *v3, int *n,
645
646 const dim3 nthrds(1024, 1, 1);
647 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
648
650 ((real *) dot, (real *) u1, (real *) u2, (real *) u3,
651 (real *) v1, (real *) v2, (real *) v3, *n);
653 }
654
659 void cuda_vcross(void *u1, void *u2, void *u3,
660 void *v1, void *v2, void *v3,
661 void *w1, void *w2, void *w3,
662 int *n, cudaStream_t strm) {
663
664 const dim3 nthrds(1024, 1, 1);
665 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
666
668 ((real *) u1, (real *) u2, (real *) u3,
669 (real *) v1, (real *) v2, (real *) v3,
670 (real *) w1, (real *) w2, (real *) w3, *n);
672 }
673
674
675 /*
676 * Reduction buffer
677 */
678 int red_s = 0;
680 void * bufred_d = NULL;
681
686 if ( nb >= red_s) {
687 red_s = nb+1;
688 if (bufred != NULL) {
690#ifdef HAVE_NVSHMEM
692#else
694#endif
695 }
697#ifdef HAVE_NVSHMEM
698 bufred_d = (real *) nvshmem_malloc(red_s*sizeof(real));
699#else
701#endif
702 }
703 }
704
709 int n, const cudaStream_t stream) {
710
711#ifdef HAVE_NCCL
713 DEVICE_NCCL_SUM, stream);
715 cudaMemcpyDeviceToHost, stream));
716 cudaStreamSynchronize(stream);
717#elif HAVE_NVSHMEM
718 if (sizeof(real) == sizeof(float)) {
720 (float *) bufred_d,
721 (float *) bufred_d, n, stream);
722 }
723 else if (sizeof(real) == sizeof(double)) {
725 (double *) bufred_d,
726 (double *) bufred_d, n, stream);
727
728 }
730 sizeof(real)*n, cudaMemcpyDeviceToHost, stream));
731 cudaStreamSynchronize(stream);
732#elif HAVE_DEVICE_MPI
733 cudaStreamSynchronize(stream);
735#else
737 cudaMemcpyDeviceToHost, stream));
738 cudaStreamSynchronize(stream);
739#endif
740 }
741
746 int n, const cudaStream_t stream) {
747
748#ifdef HAVE_NCCL
750 DEVICE_NCCL_MAX, stream);
752 cudaMemcpyDeviceToHost, stream));
753 cudaStreamSynchronize(stream);
754#elif HAVE_NVSHMEM
755 if (sizeof(real) == sizeof(float)) {
757 (float *) bufred_d,
758 (float *) bufred_d, n, stream);
759 }
760 else if (sizeof(real) == sizeof(double)) {
762 (double *) bufred_d,
763 (double *) bufred_d, n, stream);
764
765 }
767 sizeof(real)*n, cudaMemcpyDeviceToHost, stream));
768 cudaStreamSynchronize(stream);
769#elif HAVE_DEVICE_MPI
770 cudaStreamSynchronize(stream);
772#else
774 cudaMemcpyDeviceToHost, stream));
775 cudaStreamSynchronize(stream);
776#endif
777 }
778
783 int n, const cudaStream_t stream) {
784
785#ifdef HAVE_NCCL
787 DEVICE_NCCL_MIN, stream);
789 cudaMemcpyDeviceToHost, stream));
790 cudaStreamSynchronize(stream);
791#elif HAVE_NVSHMEM
792 if (sizeof(real) == sizeof(float)) {
794 (float *) bufred_d,
795 (float *) bufred_d, n, stream);
796 }
797 else if (sizeof(real) == sizeof(double)) {
799 (double *) bufred_d,
800 (double *) bufred_d, n, stream);
801
802 }
804 sizeof(real)*n, cudaMemcpyDeviceToHost, stream));
805 cudaStreamSynchronize(stream);
806#elif HAVE_DEVICE_MPI
807 cudaStreamSynchronize(stream);
809#else
811 cudaMemcpyDeviceToHost, stream));
812 cudaStreamSynchronize(stream);
813#endif
814 }
815
816
821 real cuda_vlsc3(void *u, void *v, void *w, int *n, cudaStream_t stream) {
822
823 const dim3 nthrds(1024, 1, 1);
824 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
825 const int nb = ((*n) + 1024 - 1)/ 1024;
826
828
829 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
830 ((real *) u, (real *) v, (real *) w, (real *) bufred_d, *n);
832 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
834
836 cudaMemcpyDeviceToHost, stream));
837 cudaStreamSynchronize(stream);
838
839 return bufred[0];
840 }
841
842
843
844
849 real cuda_glsc3(void *a, void *b, void *c, int *n, cudaStream_t stream) {
850
851 const dim3 nthrds(1024, 1, 1);
852 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
853 const int nb = ((*n) + 1024 - 1)/ 1024;
854
856
857 if ( *n > 0) {
858 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
859 ((real *) a, (real *) b, (real *) c, (real *) bufred_d, *n);
861 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
863 }
864 else {
865 cuda_rzero(bufred_d, &red_s, stream);
866 }
868
869 return bufred[0];
870 }
871
876 void cuda_glsc3_many(real *h, void * w, void *v,void *mult, int *j, int *n,
877 cudaStream_t stream){
878 int pow2 = 1;
879 while(pow2 < (*j)){
880 pow2 = 2*pow2;
881 }
882 const int nt = 1024/pow2;
883 const dim3 nthrds(pow2, nt, 1);
884 const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
885 const int nb = ((*n) + nt - 1)/nt;
886
888
889 if ( *n > 0) {
890 glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
891 ((const real *) w, (const real **) v,
892 (const real *)mult, (real *)bufred_d, *j, *n);
895 <<<(*j), 1024, 0, stream>>>((real *) bufred_d, nb, *j);
897 }
898 else {
899 cuda_rzero(bufred_d, &red_s, stream);
900 }
901 cuda_global_reduce_add(h, bufred_d, (*j), stream);
902 }
903
908 real cuda_glsc2(void *a, void *b, int *n, cudaStream_t stream) {
909
910 const dim3 nthrds(1024, 1, 1);
911 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
912 const int nb = ((*n) + 1024 - 1)/ 1024;
913
915
916 if ( *n > 0) {
918 <<<nblcks, nthrds, 0, stream>>>((real *) a,
919 (real *) b,
920 (real *) bufred_d, *n);
922 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
924 }
925 else {
926 cuda_rzero(bufred_d, &red_s, stream);
927 }
929
930 return bufred[0];
931 }
932
937 real cuda_glsubnorm2(void *a, void *b, int *n, cudaStream_t stream) {
938
939 const dim3 nthrds(1024, 1, 1);
940 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
941 const int nb = ((*n) + 1024 - 1)/ 1024;
942
944
945 if ( *n > 0) {
947 <<<nblcks, nthrds, 0, stream>>>((real *) a,
948 (real *) b,
949 (real *) bufred_d, *n);
951 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
953 }
954 else {
955 cuda_rzero(bufred_d, &red_s, stream);
956 }
958
959 return bufred[0];
960 }
961
966 real cuda_glsum(void *a, int *n, cudaStream_t stream) {
967 const dim3 nthrds(1024, 1, 1);
968 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
969 const int nb = ((*n) + 1024 - 1)/ 1024;
970
972 if ( *n > 0) {
974 <<<nblcks, nthrds, 0, stream>>>((real *) a,
975 (real *) bufred_d, *n);
977 reduce_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d, nb);
979 }
980 else {
981 cuda_rzero(bufred_d, &red_s, stream);
982 }
983
985
986 return bufred[0];
987 }
988
993 real cuda_glmax(void *a, real *ninf, int *n, cudaStream_t stream) {
994 const dim3 nthrds(1024, 1, 1);
995 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
996 const int nb = ((*n) + 1024 - 1)/ 1024;
997
999 if ( *n > 0) {
1001 <<<nblcks, nthrds, 0, stream>>>((real *) a, *ninf,
1002 (real *) bufred_d, *n);
1004 reduce_max_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d,
1005 *ninf, nb);
1007 }
1008 else {
1009 cuda_rzero(bufred_d, &red_s, stream);
1010 }
1011
1013
1014 return bufred[0];
1015 }
1016
1021 real cuda_glmin(void *a, real *pinf, int *n, cudaStream_t stream) {
1022 const dim3 nthrds(1024, 1, 1);
1023 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
1024 const int nb = ((*n) + 1024 - 1)/ 1024;
1025
1027 if ( *n > 0) {
1029 <<<nblcks, nthrds, 0, stream>>>((real *) a, *pinf,
1030 (real *) bufred_d, *n);
1032 reduce_min_kernel<real><<<1, 1024, 0, stream>>> ((real *) bufred_d,
1033 *pinf, nb);
1035 }
1036 else {
1037 cuda_rzero(bufred_d, &red_s, stream);
1038 }
1039
1041
1042 return bufred[0];
1043 }
1044
1049 void cuda_absval(void *a, int *n, cudaStream_t stream) {
1050
1051 const dim3 nthrds(1024, 1, 1);
1052 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
1053
1055 <<<nblcks, nthrds,0, stream>>>((real *) a, * n);
1057
1058 }
1059
1060 // ======================================================================== //
1061 // Point-wise operations.
1062
1067 void cuda_pwmax_vec2(void *a, void *b, int *n, cudaStream_t stream) {
1068
1069 const dim3 nthrds(1024, 1, 1);
1070 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1071
1072 pwmax_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>
1073 ((real *)a, (real *)b, *n);
1075 }
1076
1081 void cuda_pwmax_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream) {
1082
1083 const dim3 nthrds(1024, 1, 1);
1084 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1085
1086 pwmax_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>
1087 ((real *)a, (real *)b, (real *)c, *n);
1089 }
1090
1095 void cuda_pwmax_sca2(void *a, real *c, int *n, cudaStream_t stream) {
1096
1097 const dim3 nthrds(1024, 1, 1);
1098 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1099
1100 pwmax_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>
1101 ((real *)a, *c, *n);
1103 }
1104
1109 void cuda_pwmax_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream) {
1110
1111 const dim3 nthrds(1024, 1, 1);
1112 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1113
1114 pwmax_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>
1115 ((real *)a, (real *)b, *c, *n);
1117 }
1118
1123 void cuda_pwmin_vec2(void *a, void *b, int *n, cudaStream_t stream) {
1124
1125 const dim3 nthrds(1024, 1, 1);
1126 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1127
1128 pwmin_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>
1129 ((real *)a, (real *)b, *n);
1131 }
1132
1137 void cuda_pwmin_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream) {
1138
1139 const dim3 nthrds(1024, 1, 1);
1140 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1141
1142 pwmin_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>
1143 ((real *)a, (real *)b, (real *)c, *n);
1145 }
1146
1151 void cuda_pwmin_sca2(void *a, real *c, int *n, cudaStream_t stream) {
1152
1153 const dim3 nthrds(1024, 1, 1);
1154 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1155
1156 pwmin_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *)a, *c, *n);
1158 }
1159
1164 void cuda_pwmin_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream) {
1165
1166 const dim3 nthrds(1024, 1, 1);
1167 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1168
1169 pwmin_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>
1170 ((real *)a, (real *)b, *c, *n);
1172 }
1173
1174 // ======================================================================== //
1175
1179 void cuda_iadd(void *a, int *c, int *n, cudaStream_t stream) {
1180
1181 const dim3 nthrds(1024, 1, 1);
1182 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
1183
1184 cadd_kernel<int><<<nblcks, nthrds, 0, stream>>>
1185 ((int *) a, *c, *n);
1187
1188 }
1189
1190} /* extern "C" */
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
#define CUDA_CHECK(err)
Definition check.h:6
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
#define DEVICE_MPI_MAX
#define DEVICE_MPI_MIN
#define DEVICE_MPI_SUM
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
#define DEVICE_NCCL_MAX
#define DEVICE_NCCL_MIN
#define DEVICE_NCCL_SUM
void device_nccl_allreduce(void *sbuf_d, void *rbuf_d, int count, int nbytes, int op, void *stream)
void cuda_global_reduce_min(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
Definition math.cu:782
void cuda_pwmax_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
Definition math.cu:1109
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
Definition math.cu:612
void cuda_add2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:386
real cuda_glsubnorm2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:937
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cudaStream_t strm)
Definition math.cu:642
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cudaStream_t strm)
Definition math.cu:436
void cuda_face_masked_gather_copy(void *a, void *b, void *mask, void *facet, int *n1, int *n2, int *lx, int *ly, int *lz, int *m, cudaStream_t strm)
Definition math.cu:129
void cuda_global_reduce_add(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
Definition math.cu:708
real cuda_glsc2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:908
void cuda_addcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:598
void cuda_pwmin_vec2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:1123
void cuda_pwmax_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:1081
real cuda_glmin(void *a, real *pinf, int *n, cudaStream_t stream)
Definition math.cu:1021
void cuda_cmult2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:224
void cuda_add2s1(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:370
void cuda_col3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:541
void cuda_masked_copy_0(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:68
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n, cudaStream_t strm)
Definition math.cu:403
void * bufred_d
Definition math.cu:680
void cuda_sub3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:584
void cuda_copy(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:59
void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size, cudaStream_t strm)
Definition math.cu:190
void cuda_masked_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:84
void cuda_invcol2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:500
void cuda_pwmax_sca2(void *a, real *c, int *n, cudaStream_t stream)
Definition math.cu:1095
void cuda_col2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:528
void cuda_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:161
void cuda_add2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:325
void cuda_redbuf_check_alloc(int nb)
Definition math.cu:685
void cuda_cmult(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:211
void cuda_add4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
Definition math.cu:354
void cuda_masked_scatter_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:175
void cuda_sub2(void *a, void *b, int *n, cudaStream_t strm)
Definition math.cu:570
real * bufred
Definition math.cu:679
void cuda_invcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:514
void cuda_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cudaStream_t strm)
Definition math.cu:659
void cuda_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:99
real cuda_glsum(void *a, int *n, cudaStream_t stream)
Definition math.cu:966
real cuda_glsc3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:849
real cuda_vlsc3(void *u, void *v, void *w, int *n, cudaStream_t stream)
Definition math.cu:821
void cuda_pwmin_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
Definition math.cu:1164
void cuda_cadd2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:279
int red_s
Definition math.cu:678
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
Definition math.cu:420
void cuda_add3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:340
void cuda_addcol3s2(void *a, void *b, void *c, real *s, int *n, cudaStream_t strm)
Definition math.cu:627
void cuda_cwrap(void *a, real *min_val, real *max_val, int *n, cudaStream_t strm)
Definition math.cu:294
void cuda_pwmin_sca2(void *a, real *c, int *n, cudaStream_t stream)
Definition math.cu:1151
void cuda_cfill(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:309
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cudaStream_t stream)
Definition math.cu:876
void cuda_absval(void *a, int *n, cudaStream_t stream)
Definition math.cu:1049
void cuda_rzero(void *a, int *n, cudaStream_t strm)
Definition math.cu:204
void cuda_cdiv2(void *a, void *b, real *c, int *n, cudaStream_t strm)
Definition math.cu:251
void cuda_cdiv(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:238
void cuda_global_reduce_max(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
Definition math.cu:745
void cuda_iadd(void *a, int *c, int *n, cudaStream_t stream)
Definition math.cu:1179
void cuda_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:114
void cuda_pwmax_vec2(void *a, void *b, int *n, cudaStream_t stream)
Definition math.cu:1067
void cuda_masked_atomic_reduction(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
Definition math.cu:147
void cuda_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, cudaStream_t strm)
Definition math.cu:470
void cuda_invcol1(void *a, int *n, cudaStream_t strm)
Definition math.cu:487
void cuda_radd(void *a, real *c, int *n, cudaStream_t strm)
Definition math.cu:265
void cuda_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, cudaStream_t strm)
Definition math.cu:453
real cuda_glmax(void *a, real *ninf, int *n, cudaStream_t stream)
Definition math.cu:993
void cuda_subcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
Definition math.cu:555
void cuda_pwmin_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Definition math.cu:1137
Object for handling masks in Neko.
Definition mask.f90:34