Neko 0.9.99
A portable framework for high-order spectral element flow simulations
math.cu
/*
 Copyright (c) 2021-2025, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

#include "math_kernel.h"
#include <device/device_config.h>
#include <device/cuda/check.h>
#include <stdio.h>
#include <stdlib.h>

#ifdef HAVE_NVSHMEM
#include <nvshmem.h>
#include <nvshmemx.h>
#endif

extern "C" {

#include <math/bcknd/device/device_mpi_reduce.h>
#include <math/bcknd/device/device_mpi_op.h>

#ifdef HAVE_NCCL
#include <math/bcknd/device/device_nccl_reduce.h>
#include <math/bcknd/device/device_nccl_op.h>
#endif

  /** Fortran wrapper for copy
   * Copy a vector, \f$ a = b \f$
   */
  void cuda_copy(void *a, void *b, int *n) {
    CUDA_CHECK(cudaMemcpyAsync(a, b, (*n) * sizeof(real),
                               cudaMemcpyDeviceToDevice,
                               (cudaStream_t) glb_cmd_queue));
  }
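
  /* Usage sketch (illustrative, not part of this file): both arguments are
   * device pointers and the copy runs asynchronously on glb_cmd_queue.
   *
   *   real *a_d, *b_d;
   *   int n = 4096;
   *   CUDA_CHECK(cudaMalloc(&a_d, n * sizeof(real)));
   *   CUDA_CHECK(cudaMalloc(&b_d, n * sizeof(real)));
   *   cuda_copy(a_d, b_d, &n);    // a_d <- b_d, device to device
   */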

  /** Fortran wrapper for masked copy
   * Copy a masked vector, \f$ a(mask) = b(mask) \f$
   */
  void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

    masked_copy_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (int *) mask, *n, *m);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Fortran wrapper for masked reduced copy
   * Gather the masked entries of b into a contiguous vector,
   * \f$ a = b(mask) \f$
   */
  void cuda_masked_red_copy(void *a, void *b, void *mask, int *n, int *m) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

    masked_red_copy_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (int *) mask, *n, *m);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Fortran wrapper for masked atomic reduction
   * Atomically accumulate the entries of b into the masked entries of a,
   * so that repeated mask indices remain safe.
   */
  void cuda_masked_atomic_reduction(void *a, void *b, void *mask,
                                    int *n, int *m) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

    masked_atomic_reduction_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (int *) mask, *n, *m);
    CUDA_CHECK(cudaGetLastError());

  }

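  /* A minimal sketch of the semantics assumed for the three masked kernels
   * above (the actual kernels live in math_kernel.h; the mask is assumed to
   * use 1-based Fortran indices, with the mask length stored at mask[0]):
   *
   *   a[mask[i] - 1] = b[mask[i] - 1];       // masked_copy
   *   a[i] = b[mask[i] - 1];                 // masked_red_copy (gather)
   *   atomicAdd(&a[mask[i] - 1], b[i]);      // masked_atomic_reduction
   */
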
  /** Fortran wrapper for cfill_mask
   * Fill a masked vector with a constant, \f$ a(mask) = c \f$
   */
  void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);

    cfill_mask_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>(
        (real *) a, *c, *size, mask, *mask_size);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for rzero
   * Zero a real vector, \f$ a = 0 \f$
   */
  void cuda_rzero(void *a, int *n) {
    CUDA_CHECK(cudaMemsetAsync(a, 0, (*n) * sizeof(real),
                               (cudaStream_t) glb_cmd_queue));
  }

  /** Fortran wrapper for cmult
   * Multiplication by a constant, \f$ a = c \cdot a \f$
   */
  void cuda_cmult(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cmult_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Fortran wrapper for cmult2
   * Multiplication by a constant, \f$ a = c \cdot b \f$
   */
  void cuda_cmult2(void *a, void *b, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cmult2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Fortran wrapper for cadd
   * Add a scalar to a vector, \f$ a_i = a_i + c \f$
   */
  void cuda_cadd(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cadd_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Fortran wrapper for cadd2
   * Add a scalar to a vector, \f$ a_i = b_i + c \f$
   */
  void cuda_cadd2(void *a, void *b, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cadd2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Fortran wrapper for cfill
   * Fill a vector with a constant, \f$ a = c \f$
   */
  void cuda_cfill(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    if (*n > 0) {
      cfill_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
      CUDA_CHECK(cudaGetLastError());
    }

  }

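  /* All element-wise wrappers share one launch convention: 1024 threads per
   * block and a ceiling division for the grid, so each element is owned by
   * exactly one thread. For example:
   *
   *   n = 1    -> nblcks = (1    + 1023) / 1024 = 1
   *   n = 1024 -> nblcks = (1024 + 1023) / 1024 = 1
   *   n = 1025 -> nblcks = (1025 + 1023) / 1024 = 2
   *
   * The kernels are expected to guard with `if (i < n)` since the last block
   * may be partially filled; cuda_cfill additionally skips the launch for
   * n == 0, where the computed grid size would be zero and thus invalid.
   */
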
  /** Fortran wrapper for add2
   * Vector addition, \f$ a = a + b \f$
   */
  void cuda_add2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Fortran wrapper for add3
   * Vector addition, \f$ a = b + c \f$
   */
  void cuda_add3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for add4
   * Vector addition, \f$ a = b + c + d \f$
   */
  void cuda_add4(void *a, void *b, void *c, void *d, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add4_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c,
                                      (real *) d, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Fortran wrapper for add2s1
   * Scaled vector addition, \f$ a = c_1 a + b \f$
   * (the scalar multiplies the first argument)
   */
  void cuda_add2s1(void *a, void *b, real *c1, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2s1_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Fortran wrapper for add2s2
   * Scaled vector addition, \f$ a = a + c_1 b \f$
   * (the scalar multiplies the second argument)
   */
  void cuda_add2s2(void *a, void *b, real *c1, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2s2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Fortran wrapper for add2s2_many
   * Apply several scaled additions in one pass,
   * \f$ x = x + \sum_k \alpha_k p_k \f$
   */
  void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2s2_many_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) x, (const real **) p,
                                      (real *) alpha, *j, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Fortran wrapper for addsqr2s2
   * Scaled addition of a squared vector, \f$ a = a + c_1 (b \cdot b) \f$
   */
  void cuda_addsqr2s2(void *a, void *b, real *c1, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    addsqr2s2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Fortran wrapper for add3s2
   * Scaled vector addition, \f$ a = c_1 b + c_2 c \f$
   */
  void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add3s2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c,
                                      *c1, *c2, *n);
    CUDA_CHECK(cudaGetLastError());

  }

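  /* Usage sketch (illustrative): the add2s* wrappers are AXPY-style updates
   * in the Nek5000 naming convention, e.g.
   *
   *   real c1 = 0.5;
   *   int n = 4096;
   *   cuda_add2s2(a_d, b_d, &c1, &n);   // a_d(i) = a_d(i) + 0.5 * b_d(i)
   *
   * while cuda_add2s2_many fuses several such updates into one kernel,
   * x = x + sum_k alpha(k) * p(k), trading j launches for a single pass.
   */
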
  /** Fortran wrapper for invcol1
   * Invert a vector element-wise, \f$ a_i = 1 / a_i \f$
   */
  void cuda_invcol1(void *a, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    invcol1_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for invcol2
   * Element-wise division, \f$ a_i = a_i / b_i \f$
   */
  void cuda_invcol2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    invcol2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for col2
   * Element-wise multiplication, \f$ a_i = a_i \, b_i \f$
   */
  void cuda_col2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    col2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for col3
   * Element-wise multiplication, \f$ a_i = b_i \, c_i \f$
   */
  void cuda_col3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    col3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for subcol3
   * Subtract a product of two vectors, \f$ a_i = a_i - b_i \, c_i \f$
   */
  void cuda_subcol3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    subcol3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for sub2
   * Vector subtraction, \f$ a = a - b \f$
   */
  void cuda_sub2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for sub3
   * Vector subtraction, \f$ a = b - c \f$
   */
  void cuda_sub3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for addcol3
   * Add a product of two vectors, \f$ a_i = a_i + b_i \, c_i \f$
   */
  void cuda_addcol3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    addcol3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for addcol4
   * Add a product of three vectors, \f$ a_i = a_i + b_i \, c_i \, d_i \f$
   */
  void cuda_addcol4(void *a, void *b, void *c, void *d, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    addcol4_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, (real *) d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for vdot3
   * Element-wise dot product of two 3d vector fields,
   * \f$ dot = u \cdot v \f$
   */
  void cuda_vdot3(void *dot, void *u1, void *u2, void *u3,
                  void *v1, void *v2, void *v3, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    vdot3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) dot, (real *) u1,
                                      (real *) u2, (real *) u3,
                                      (real *) v1, (real *) v2,
                                      (real *) v3, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for vcross
   * Element-wise cross product of two 3d vector fields,
   * \f$ u = v \times w \f$
   */
  void cuda_vcross(void *u1, void *u2, void *u3,
                   void *v1, void *v2, void *v3,
                   void *w1, void *w2, void *w3, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    vcross_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) u1, (real *) u2, (real *) u3,
                                      (real *) v1, (real *) v2, (real *) v3,
                                      (real *) w1, (real *) w2, (real *) w3,
                                      *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /*
   * Reduction buffer
   */
  int red_s = 0;
  real * bufred = NULL;
  void * bufred_d = NULL;

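  /* The reductions below are two-stage: a first kernel writes one partial
   * sum per thread block into bufred_d, then reduce_kernel collapses those
   * partials with a single 1024-thread block. The buffers persist across
   * calls and only grow; each wrapper applies the same policy:
   *
   *   if (nb > red_s) {                 // more blocks than ever before
   *     red_s = nb;                     // new high-water mark
   *     // free the old buffers (if any), then reallocate:
   *     cudaMallocHost(&bufred, nb * sizeof(real));   // pinned host mirror
   *     cudaMalloc(&bufred_d, nb * sizeof(real));     // or nvshmem_malloc
   *   }
   */
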
  /** Fortran wrapper for vlsc3
   * Compute the local (single rank) weighted inner product
   * \f$ \sum u_i \, v_i \, w_i \f$
   */
  real cuda_vlsc3(void *u, void *v, void *w, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }

    glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *) u, (real *) v, (real *) w, (real *) bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>((real *) bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    return bufred[0];
  }

  /** Fortran wrapper for glsc3
   * Compute the global weighted inner product
   * \f$ \sum a_i \, b_i \, c_i \f$ across all ranks.
   */
  real cuda_glsc3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
#ifdef HAVE_NVSHMEM
        nvshmem_free(bufred_d);
#else
        CUDA_CHECK(cudaFree(bufred_d));
#endif
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
#ifdef HAVE_NVSHMEM
      bufred_d = (real *) nvshmem_malloc(nb * sizeof(real));
#else
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
#endif
    }

    glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *) a, (real *) b, (real *) c, (real *) bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>((real *) bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_NCCL
    device_nccl_allreduce(bufred_d, bufred_d, 1, sizeof(real),
                          DEVICE_NCCL_SUM, stream);
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#elif HAVE_NVSHMEM
    if (sizeof(real) == sizeof(float)) {
      nvshmemx_float_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                          (float *) bufred_d,
                                          (float *) bufred_d, 1, stream);
    }
    else if (sizeof(real) == sizeof(double)) {
      nvshmemx_double_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                           (double *) bufred_d,
                                           (double *) bufred_d, 1, stream);
    }
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d,
                               sizeof(real), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#elif HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif

    return bufred[0];
  }

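  /* Note on the preprocessor branches above: the rank-local result can be
   * combined across ranks in four ways, selected at build time. NCCL and
   * NVSHMEM reduce bufred_d in place on the device before it is copied back;
   * the device-MPI path reduces into the pinned host buffer bufred; the
   * fallback is the plain single-rank copy. Every path synchronizes the
   * stream before bufred[0] is read on the host.
   */
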
  /** Fortran wrapper for glsc3_many
   * Compute j global weighted inner products in one pass,
   * \f$ h_k = \sum w_i \, (v_k)_i \, mult_i \f$
   */
  void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n) {
    int pow2 = 1;
    while (pow2 < (*j)) {
      pow2 = 2 * pow2;
    }
    const int nt = 1024 / pow2;
    const dim3 nthrds(pow2, nt, 1);
    const dim3 nblcks(((*n) + nt - 1) / nt, 1, 1);
    const int nb = ((*n) + nt - 1) / nt;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if ((*j) * nb > red_s) {
      red_s = (*j) * nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
#ifdef HAVE_NVSHMEM
        nvshmem_free(bufred_d);
#else
        CUDA_CHECK(cudaFree(bufred_d));
#endif
      }
      CUDA_CHECK(cudaMallocHost(&bufred, (*j) * nb * sizeof(real)));
#ifdef HAVE_NVSHMEM
      bufred_d = (real *) nvshmem_malloc((*j) * nb * sizeof(real));
#else
      CUDA_CHECK(cudaMalloc(&bufred_d, (*j) * nb * sizeof(real)));
#endif
    }

    glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((const real *) w, (const real **) v,
       (const real *) mult, (real *) bufred_d, *j, *n);
    CUDA_CHECK(cudaGetLastError());
    glsc3_reduce_kernel<real>
      <<<(*j), 1024, 0, stream>>>((real *) bufred_d, nb, *j);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_NCCL
    device_nccl_allreduce(bufred_d, bufred_d, (*j), sizeof(real),
                          DEVICE_NCCL_SUM, stream);
    CUDA_CHECK(cudaMemcpyAsync(h, bufred_d, (*j) * sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#elif HAVE_NVSHMEM
    if (sizeof(real) == sizeof(float)) {
      nvshmemx_float_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                          (float *) bufred_d,
                                          (float *) bufred_d, (*j), stream);
    }
    else if (sizeof(real) == sizeof(double)) {
      nvshmemx_double_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                           (double *) bufred_d,
                                           (double *) bufred_d, (*j), stream);
    }
    CUDA_CHECK(cudaMemcpyAsync(h, bufred_d, (*j) * sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#elif HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    device_mpi_allreduce(bufred_d, h, (*j), sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(h, bufred_d, (*j) * sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif
  }

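  /* Launch geometry for cuda_glsc3_many: pow2 is the smallest power of two
   * >= j, so each block is pow2 threads wide (one lane per inner product)
   * and nt = 1024 / pow2 rows tall, keeping the block at 1024 threads.
   * For example, j = 3 gives pow2 = 4 and nt = 256, i.e. dim3(4, 256, 1).
   */
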
  /** Fortran wrapper for glsc2
   * Compute the global inner product \f$ \sum a_i \, b_i \f$
   * across all ranks.
   */
  real cuda_glsc2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
#ifdef HAVE_NVSHMEM
        nvshmem_free(bufred_d);
#else
        CUDA_CHECK(cudaFree(bufred_d));
#endif
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
#ifdef HAVE_NVSHMEM
      bufred_d = (real *) nvshmem_malloc(nb * sizeof(real));
#else
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
#endif
    }

    glsc2_kernel<real>
      <<<nblcks, nthrds, 0, stream>>>((real *) a,
                                      (real *) b,
                                      (real *) bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>((real *) bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_NCCL
    device_nccl_allreduce(bufred_d, bufred_d, 1, sizeof(real),
                          DEVICE_NCCL_SUM, stream);
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#elif HAVE_NVSHMEM
    if (sizeof(real) == sizeof(float)) {
      nvshmemx_float_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                          (float *) bufred_d,
                                          (float *) bufred_d, 1, stream);
    }
    else if (sizeof(real) == sizeof(double)) {
      nvshmemx_double_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                           (double *) bufred_d,
                                           (double *) bufred_d, 1, stream);
    }
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d,
                               sizeof(real), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#elif HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif

    return bufred[0];
  }

  /** Fortran wrapper for glsum
   * Compute the global sum \f$ \sum a_i \f$ across all ranks.
   */
  real cuda_glsum(void *a, int *n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
#ifdef HAVE_NVSHMEM
        nvshmem_free(bufred_d);
#else
        CUDA_CHECK(cudaFree(bufred_d));
#endif
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
#ifdef HAVE_NVSHMEM
      bufred_d = (real *) nvshmem_malloc(nb * sizeof(real));
#else
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
#endif
    }

    if (*n > 0) {
      glsum_kernel<real>
        <<<nblcks, nthrds, 0, stream>>>((real *) a,
                                        (real *) bufred_d, *n);
      CUDA_CHECK(cudaGetLastError());
      reduce_kernel<real><<<1, 1024, 0, stream>>>((real *) bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
    }
#ifdef HAVE_NCCL
    device_nccl_allreduce(bufred_d, bufred_d, 1, sizeof(real),
                          DEVICE_NCCL_SUM, stream);
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#elif HAVE_NVSHMEM
    if (sizeof(real) == sizeof(float)) {
      nvshmemx_float_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                          (float *) bufred_d,
                                          (float *) bufred_d, 1, stream);
    }
    else if (sizeof(real) == sizeof(double)) {
      nvshmemx_double_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,
                                           (double *) bufred_d,
                                           (double *) bufred_d, 1, stream);
    }
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d,
                               sizeof(real), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#elif HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif

    return bufred[0];
  }

  /** Fortran wrapper for absval
   * Take the absolute value element-wise, \f$ a_i = |a_i| \f$
   */
  void cuda_absval(void *a, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    absval_kernel<real>
      <<<nblcks, nthrds, 0, stream>>>((real *) a, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  // ======================================================================== //
  // Point-wise operations.

  /** Fortran wrapper for pwmax_vec2
   * Point-wise maximum of two vectors, \f$ a_i = \max(a_i, b_i) \f$
   */
  void cuda_pwmax_vec2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmax_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for pwmax_vec3
   * Point-wise maximum of two vectors, \f$ a_i = \max(b_i, c_i) \f$
   */
  void cuda_pwmax_vec3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmax_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, (real *) b, (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for pwmax_sca2
   * Point-wise maximum with a scalar, \f$ a_i = \max(a_i, c) \f$
   */
  void cuda_pwmax_sca2(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmax_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for pwmax_sca3
   * Point-wise maximum with a scalar, \f$ a_i = \max(b_i, c) \f$
   */
  void cuda_pwmax_sca3(void *a, void *b, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmax_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, (real *) b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for pwmin_vec2
   * Point-wise minimum of two vectors, \f$ a_i = \min(a_i, b_i) \f$
   */
  void cuda_pwmin_vec2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmin_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for pwmin_vec3
   * Point-wise minimum of two vectors, \f$ a_i = \min(b_i, c_i) \f$
   */
  void cuda_pwmin_vec3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmin_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, (real *) b, (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for pwmin_sca2
   * Point-wise minimum with a scalar, \f$ a_i = \min(a_i, c) \f$
   */
  void cuda_pwmin_sca2(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmin_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fortran wrapper for pwmin_sca3
   * Point-wise minimum with a scalar, \f$ a_i = \min(b_i, c) \f$
   */
  void cuda_pwmin_sca3(void *a, void *b, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmin_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, (real *) b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

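  /* Usage sketch (illustrative; the exact element-wise semantics follow the
   * vec/sca naming used above and are assumptions about math_kernel.h):
   *
   *   cuda_pwmax_vec2(a_d, b_d, &n);       // a(i) = max(a(i), b(i))
   *   cuda_pwmin_sca3(a_d, b_d, &c, &n);   // a(i) = min(b(i), c)
   */
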
} /* extern "C" */