Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.hip
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#include <hip/hip_runtime.h>
37#include <device/hip/check.h>
38#include "math_kernel.h"
39
40extern "C" {
41
44
45#ifdef HAVE_RCCL
48#endif
49
53 void hip_copy(void *a, void *b, int *n, hipStream_t strm) {
54 HIP_CHECK(hipMemcpyAsync(a, b, (*n) * sizeof(real),
56 }
57
61 void hip_masked_copy_0(void *a, void *b, void *mask,
62 int *n, int *m, hipStream_t strm) {
63
64 const dim3 nthrds(1024, 1, 1);
65 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
66
68 nblcks, nthrds, 0, strm, (real *) a,
69 (real *) b, (int *) mask, *n, *m);
70
72
73 }
74
78 void hip_masked_copy_aligned(void *a, void *b, void *mask,
79 int *n, int *m, hipStream_t strm) {
80
81 const dim3 nthrds(1024, 1, 1);
82 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
83
85 nblcks, nthrds, 0, strm, (real *) a,
86 (real *) b, (int *) mask, *n, *m);
87
89
90 }
91
95 void hip_masked_gather_copy(void *a, void *b, void *mask,
96 int *n, int *m, hipStream_t strm) {
97
98 const dim3 nthrds(1024, 1, 1);
99 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
100
102 nblcks, nthrds, 0, strm, (real *) a,
103 (real *) b, (int *) mask, *n, *m);
105
106 }
107
111 void hip_masked_gather_copy_aligned(void *a, void *b, void *mask,
112 int *n, int *m, hipStream_t strm) {
113
114 const dim3 nthrds(1024, 1, 1);
115 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
116
118 nblcks, nthrds, 0, strm, (real *) a,
119 (real *) b, (int *) mask, *n, *m);
121
122 }
123
127 void hip_face_masked_gather_copy(void *a, void *b, void *mask,
128 void *facet, int *n1, int *n2, int *lx,
129 int *ly, int *lz, int *m,
131
132 const dim3 nthrds(1024, 1, 1);
133 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
134
136 nblcks, nthrds, 0, strm, (real *) a, (real *) b,
137 (int *) mask, (int *) facet, *n1, *n2, *lx, *ly, *lz,
138 *m);
140
141 }
142
143
147 void hip_masked_scatter_copy(void *a, void *b, void *mask,
148 int *n, int *m, hipStream_t strm) {
149
150 const dim3 nthrds(1024, 1, 1);
151 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
152
154 nblcks, nthrds, 0, strm, (real *) a,
155 (real *) b, (int *) mask, *n, *m);
156
158
159 }
160
164 void hip_masked_scatter_copy_aligned(void *a, void *b, void *mask,
165 int *n, int *m, hipStream_t strm) {
166
167 const dim3 nthrds(1024, 1, 1);
168 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
169
171 nblcks, nthrds, 0, strm, (real *) a,
172 (real *) b, (int *) mask, *n, *m);
173
175
176 }
177
181 void hip_masked_atomic_reduction(void *a, void *b, void *mask,
182 int *n, int *m, hipStream_t strm) {
183
184 const dim3 nthrds(1024, 1, 1);
185 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
186
188 nblcks, nthrds, 0, strm, (real *) a,
189 (real *) b, (int *) mask, *n, *m);
190
192
193 }
194
198 void hip_cfill_mask(void* a, real* c, int* size, void* mask, int* mask_size,
200
201 const dim3 nthrds(1024, 1, 1);
202 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
203
205 nblcks, nthrds, 0, strm, (real*)a,
206 *c, *size, (int*)mask, *mask_size);
207
209 }
210
214 void hip_rzero(void *a, int *n, hipStream_t strm) {
215 HIP_CHECK(hipMemsetAsync(a, 0, (*n) * sizeof(real), strm));
216 }
217
221 void hip_cmult(void *a, real *c, int *n, hipStream_t strm) {
222
223 const dim3 nthrds(1024, 1, 1);
224 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
225
227 nblcks, nthrds, 0, strm, (real *) a, *c, *n);
229
230 }
231
235 void hip_cmult2(void *a, void *b, real *c, int *n, hipStream_t strm) {
236
237 const dim3 nthrds(1024, 1, 1);
238 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
239
241 nblcks, nthrds, 0, strm, (real *) a,(real *) b, *c, *n);
243
244 }
245
249 void hip_cdiv(void *a, real *c, int *n, hipStream_t strm) {
250
251 const dim3 nthrds(1024, 1, 1);
252 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
253
255 nblcks, nthrds, 0, strm, (real *) a, *c, *n);
257
258 }
259
263 void hip_cdiv2(void *a, void *b, real *c, int *n, hipStream_t strm) {
264
265 const dim3 nthrds(1024, 1, 1);
266 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
267
269 nblcks, nthrds, 0, strm, (real *) a,(real *) b, *c, *n);
271
272 }
273
277 void hip_radd(void *a, real *c, int *n, hipStream_t strm) {
278
279 const dim3 nthrds(1024, 1, 1);
280 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
281
283 nblcks, nthrds, 0, strm, (real *) a, *c, *n);
285 }
286
291 void hip_cadd2(void *a, void *b, real *c, int *n, hipStream_t strm) {
292
293 const dim3 nthrds(1024, 1, 1);
294 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
295
297 nblcks, nthrds, 0, strm, (real *) a, (real *) b, *c, *n);
299 }
300
305 void hip_cwrap(void *a, real *min_val, real *max_val, int *n,
307
308 const dim3 nthrds(1024, 1, 1);
309 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
310
312 nblcks, nthrds, 0, strm, (real *) a,
313 *min_val, *max_val, *n);
315 }
316
320 void hip_cfill(void *a, real *c, int *n, hipStream_t strm) {
321
322 const dim3 nthrds(1024, 1, 1);
323 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
324
325 if (*n > 0) {
327 nblcks, nthrds, 0, strm, (real *) a, *c, *n);
329 }
330
331 }
332
337 void hip_add2(void *a, void *b, int *n, hipStream_t strm) {
338
339 const dim3 nthrds(1024, 1, 1);
340 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
341
343 nblcks, nthrds, 0, strm, (real *) a, (real *) b, *n);
345
346 }
347
352 void hip_add3(void *a, void *b, void *c, int *n, hipStream_t strm) {
353
354 const dim3 nthrds(1024, 1, 1);
355 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
356
358 nblcks, nthrds, 0, strm, (real *) a,
359 (real *) b, (real *) c, *n);
361 }
362
367 void hip_add4(void *a, void *b, void *c, void *d, int *n, hipStream_t strm) {
368
369 const dim3 nthrds(1024, 1, 1);
370 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
371
373 nblcks, nthrds, 0, strm, (real *) a,
374 (real *) b, (real *) c, (real *) d, *n);
376 }
377
383 void hip_add2s1(void *a, void *b, real *c1, int *n, hipStream_t strm) {
384
385 const dim3 nthrds(1024, 1, 1);
386 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
387
389 nblcks, nthrds, 0, strm, (real *) a,
390 (real *) b, *c1, *n);
392 }
393
399 void hip_add2s2(void *a, void *b, real *c1, int *n, hipStream_t strm) {
400
401 const dim3 nthrds(1024, 1, 1);
402 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
403
405 nblcks, nthrds, 0, strm, (real *) a, (real *) b,
406 *c1, *n);
408 }
409
416 void hip_add2s2_many(void *x, void **p, void *alpha, int *j, int *n,
418
419 const dim3 nthrds(1024, 1, 1);
420 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
421
423 nblcks, nthrds, 0, strm, (real *) x,
424 (const real **) p, (real *) alpha, *j, *n);
426
427 }
428
434 void hip_addsqr2s2(void *a, void *b, real *c1, int *n, hipStream_t strm) {
435
436 const dim3 nthrds(1024, 1, 1);
437 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
438
440 nblcks, nthrds, 0, strm, (real *) a,
441 (real *) b, *c1, *n);
443 }
444
450 void hip_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n,
452
453 const dim3 nthrds(1024, 1, 1);
454 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
455
457 nblcks, nthrds, 0, strm, (real *) a,
458 (real *) b, (real *) c, *c1, *c2, *n);
460 }
461
467 void hip_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2,
468 real *c3, int *n, hipStream_t strm) {
469
470 const dim3 nthrds(1024, 1, 1);
471 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
472
474 nblcks, nthrds, 0, strm, (real *) a,
475 (real *) b, (real *) c, (real *) d, *c1, *c2, *c3, *n);
477 }
478
484 void hip_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1,
485 real *c2, real *c3, real *c4, int *n, hipStream_t strm) {
486
487 const dim3 nthrds(1024, 1, 1);
488 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
489
491 nblcks, nthrds, 0, strm, (real *) a,
492 (real *) b, (real *) c, (real *) d, (real *) e,
493 *c1, *c2, *c3, *c4, *n);
495 }
496
501 void hip_invcol1(void *a, int *n, hipStream_t strm) {
502
503 const dim3 nthrds(1024, 1, 1);
504 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
505
507 nblcks, nthrds, 0, strm, (real *) a, *n);
509 }
510
515 void hip_invcol2(void *a, void *b, int *n, hipStream_t strm) {
516
517 const dim3 nthrds(1024, 1, 1);
518 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
519
521 nblcks, nthrds, 0, strm, (real *) a, (real *) b, *n);
523 }
524
529 void hip_invcol3(void *a, void *b, void *c, int *n, hipStream_t strm) {
530
531 const dim3 nthrds(1024, 1, 1);
532 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
533
535 nblcks, nthrds, 0, strm, (real *) a,
536 (real *) b, (real *) c, *n);
538 }
539
544 void hip_col2(void *a, void *b, int *n, hipStream_t strm) {
545
546 const dim3 nthrds(1024, 1, 1);
547 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
548
550 nblcks, nthrds, 0, strm, (real *) a,
551 (real *) b, *n);
553 }
554
559 void hip_col3(void *a, void *b, void *c, int *n, hipStream_t strm) {
560
561 const dim3 nthrds(1024, 1, 1);
562 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
563
565 nblcks, nthrds, 0, strm, (real *) a,
566 (real *) b, (real *) c, *n);
568 }
569
574 void hip_subcol3(void *a, void *b, void *c, int *n, hipStream_t strm) {
575
576 const dim3 nthrds(1024, 1, 1);
577 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
578
580 nblcks, nthrds, 0, strm, (real *) a,
581 (real *) b, (real *) c, *n);
583 }
584
589 void hip_sub2(void *a, void *b, int *n, hipStream_t strm) {
590
591 const dim3 nthrds(1024, 1, 1);
592 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
593
595 nblcks, nthrds, 0, strm, (real *) a,
596 (real *) b, *n);
598 }
599
604 void hip_sub3(void *a, void *b, void *c, int *n, hipStream_t strm) {
605
606 const dim3 nthrds(1024, 1, 1);
607 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
608
610 nblcks, nthrds, 0, strm, (real *) a,
611 (real *) b, (real *) c, *n);
613 }
614
619 void hip_addcol3(void *a, void *b, void *c, int *n, hipStream_t strm) {
620
621 const dim3 nthrds(1024, 1, 1);
622 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
623
625 nblcks, nthrds, 0, strm, (real *) a,
626 (real *) b, (real *) c, *n);
628 }
629
634 void hip_addcol4(void *a, void *b, void *c, void *d, int *n,
636
637 const dim3 nthrds(1024, 1, 1);
638 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
639
641 nblcks, nthrds, 0, strm, (real *) a,
642 (real *) b, (real *) c, (real *) d, *n);
644 }
645
650 void hip_addcol3s2(void *a, void *b, void *c, real *s, int *n,
652
653 const dim3 nthrds(1024, 1, 1);
654 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
655
657 nblcks, nthrds, 0, strm, (real *) a,
658 (real *) b, (real *) c, *s, *n);
660 }
661
666 void hip_vdot3(void *dot, void *u1, void *u2, void *u3,
667 void *v1, void *v2, void *v3, int *n,
669
670 const dim3 nthrds(1024, 1, 1);
671 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
672
674 nblcks, nthrds, 0, strm,
675 (real *) dot, (real *) u1, (real *) u2, (real *) u3,
676 (real *) v1, (real *) v2, (real *) v3, *n);
678 }
679
684 void hip_vcross(void *u1, void *u2, void *u3,
685 void *v1, void *v2, void *v3,
686 void *w1, void *w2, void *w3,
687 int *n, hipStream_t strm) {
688
689 const dim3 nthrds(1024, 1, 1);
690 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
691
693 nblcks, nthrds, 0, strm,
694 (real *) u1, (real *) u2, (real *) u3,
695 (real *) v1, (real *) v2, (real *) v3,
696 (real *) w1, (real *) w2, (real *) w3, *n);
698 }
699
700
701 /*
702 * Reduction buffer
703 */
704 int red_s = 0;
707
709 if ( nb >= red_s) {
710 red_s = nb+1;
711 if (bufred != NULL) {
714 }
717 }
718 }
719
724 const hipStream_t stream) {
725 #ifdef HAVE_RCCL
727 DEVICE_NCCL_SUM, stream);
729 hipMemcpyDeviceToHost, stream));
731 #elif HAVE_DEVICE_MPI
734 #else
736 hipMemcpyDeviceToHost, stream));
738 #endif
739 }
740
741
746 const hipStream_t stream) {
747 #ifdef HAVE_RCCL
749 DEVICE_NCCL_MAX, stream);
751 hipMemcpyDeviceToHost, stream));
753 #elif HAVE_DEVICE_MPI
756 #else
758 hipMemcpyDeviceToHost, stream));
760 #endif
761 }
762
767 const hipStream_t stream) {
768 #ifdef HAVE_RCCL
770 DEVICE_NCCL_MIN, stream);
772 hipMemcpyDeviceToHost, stream));
774 #elif HAVE_DEVICE_MPI
777 #else
779 hipMemcpyDeviceToHost, stream));
781 #endif
782 }
783
788 real hip_vlsc3(void *u, void *v, void *w, int *n, hipStream_t stream) {
789
790 const dim3 nthrds(1024, 1, 1);
791 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
792 const int nb = ((*n) + 1024 - 1)/ 1024;
793
795
796
798 nblcks, nthrds, 0, stream,
799 (real *) u, (real *) v,
800 (real *) w, bufred_d, *n);
803 1, 1024, 0, stream, bufred_d, nb);
805
807 hipMemcpyDeviceToHost, stream));
809
810 return bufred[0];
811 }
812
817 real hip_glsc3(void *a, void *b, void *c, int *n, hipStream_t stream) {
818
819 const dim3 nthrds(1024, 1, 1);
820 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
821 const int nb = ((*n) + 1024 - 1)/ 1024;
822
824
825 if (*n > 0) {
827 nblcks, nthrds, 0, stream,
828 (real *) a, (real *) b,
829 (real *) c, bufred_d, *n);
832 1, 1024, 0, stream, bufred_d, nb);
834 }
835 else {
836 hip_rzero(bufred_d, &red_s, stream);
837 }
839
840 return bufred[0];
841 }
842
847 void hip_glsc3_many(real *h, void * w, void *v,void *mult, int *j, int *n,
848 hipStream_t stream){
849 int pow2 = 1;
850 while(pow2 < (*j)){
851 pow2 = 2*pow2;
852 }
853 const int nt = 1024/pow2;
854 const dim3 nthrds(pow2, nt, 1);
855 const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
856 const dim3 nthrds_red(1024,1,1);
857 const dim3 nblcks_red( (*j),1,1);
858 const int nb = ((*n) + nt - 1)/nt;
859
861
862 if (*n > 0) {
864 nblcks, nthrds, 0, stream,
865 (const real *) w, (const real **) v,
866 (const real *)mult, bufred_d, *j, *n);
868
870 nblcks_red, nthrds_red, 0, stream,
871 bufred_d, nb, *j);
873 }
874 else {
875 hip_rzero(bufred_d, &red_s, stream);
876 }
877 hip_global_reduce_add(h, bufred_d, (*j), stream);
878 }
879
884 real hip_glsc2(void *a, void *b, int *n, hipStream_t stream) {
885
886 const dim3 nthrds(1024, 1, 1);
887 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
888 const int nb = ((*n) + 1024 - 1)/ 1024;
889
890
892
893 if( *n > 0) {
895 nblcks, nthrds, 0, stream,
896 (real *) a, (real *) b, bufred_d, *n);
899 1, 1024, 0, stream, bufred_d, nb);
901 }
902 else {
903 hip_rzero(bufred_d, &red_s, stream);
904 }
906
907 return bufred[0];
908 }
909
914 real hip_glsubnorm2(void* a, void* b, int* n, hipStream_t stream) {
915
916 const dim3 nthrds(1024, 1, 1);
917 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
918 const int nb = ((*n) + 1024 - 1) / 1024;
919
921
922 if (*n > 0) {
924 nblcks, nthrds, 0, stream,
925 (real*)a, (real*)b, bufred_d, *n);
928 1, 1024, 0, stream, bufred_d, nb);
930 }
931 else {
932 hip_rzero(bufred_d, &red_s, stream);
933 }
935
936 return bufred[0];
937 }
938
943 real hip_glsum(void *a, int *n, hipStream_t stream) {
944 const dim3 nthrds(1024, 1, 1);
945 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
946 const int nb = ((*n) + 1024 - 1)/ 1024;
947
949 if( *n > 0) {
951 nblcks, nthrds, 0, stream,
952 (real *) a, bufred_d, *n);
955 1, 1024, 0, stream, bufred_d, nb);
957 }
958 else {
959 hip_rzero(bufred_d, &red_s, stream);
960 }
961
963
964 return bufred[0];
965 }
966
967
972 real hip_glmax(void *a, real *ninf, int *n, hipStream_t stream) {
973 const dim3 nthrds(1024, 1, 1);
974 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
975 const int nb = ((*n) + 1024 - 1)/ 1024;
976
978 if( *n > 0) {
980 nblcks, nthrds, 0, stream,
981 (real *) a, *ninf, bufred_d, *n);
984 1, 1024, 0, stream, bufred_d, *ninf, nb);
986 }
987 else {
988 hip_rzero(bufred_d, &red_s, stream);
989 }
990
992
993 return bufred[0];
994 }
995
1000 real hip_glmin(void *a, real *pinf, int *n, hipStream_t stream) {
1001 const dim3 nthrds(1024, 1, 1);
1002 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
1003 const int nb = ((*n) + 1024 - 1)/ 1024;
1004
1006 if( *n > 0) {
1008 nblcks, nthrds, 0, stream,
1009 (real *) a, *pinf, bufred_d, *n);
1012 1, 1024, 0, stream, bufred_d, *pinf, nb);
1014 }
1015 else {
1016 hip_rzero(bufred_d, &red_s, stream);
1017 }
1018
1020
1021 return bufred[0];
1022 }
1023
1028 void hip_absval(void *a, int *n, hipStream_t stream) {
1029
1030 const dim3 nthrds(1024, 1, 1);
1031 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
1032
1034 nblcks, nthrds, 0, stream, (real *) a, *n);
1036
1037}
1038
1039 // ======================================================================== //
1040 // Point-wise operations.
1041
1046 void hip_pwmax_vec2(void* a, void* b, int* n, hipStream_t stream) {
1047
1048 const dim3 nthrds(1024, 1, 1);
1049 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1050
1052 nblcks, nthrds, 0, stream, (real*)a, (real*)b, *n);
1054}
1055
1060 void hip_pwmax_vec3(void *a, void *b, void *c, int *n, hipStream_t stream) {
1061
1062 const dim3 nthrds(1024, 1, 1);
1063 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1064
1066 nblcks, nthrds, 0, stream,
1067 (real *)a, (real *)b, (real *)c, *n);
1069 }
1070
1075 void hip_pwmax_sca2(void *a, real *c, int *n, hipStream_t stream) {
1076
1077 const dim3 nthrds(1024, 1, 1);
1078 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1079
1081 nblcks, nthrds, 0, stream, (real *)a, *c, *n);
1083 }
1084
1089 void hip_pwmax_sca3(void *a, void *b, real *c, int *n, hipStream_t stream) {
1090
1091 const dim3 nthrds(1024, 1, 1);
1092 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1093
1095 nblcks, nthrds, 0, stream,
1096 (real *)a, (real *)b, *c, *n);
1098 }
1099
1104 void hip_pwmin_vec2(void *a, void *b, int *n, hipStream_t stream) {
1105
1106 const dim3 nthrds(1024, 1, 1);
1107 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1108
1110 nblcks, nthrds, 0, stream,
1111 (real *)a, (real *)b, *n);
1113 }
1114
1119 void hip_pwmin_vec3(void *a, void *b, void *c, int *n, hipStream_t stream) {
1120
1121 const dim3 nthrds(1024, 1, 1);
1122 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1123
1125 nblcks, nthrds, 0, stream,
1126 (real *)a, (real *)b, (real *)c, *n);
1128 }
1129
1134 void hip_pwmin_sca2(void *a, real *c, int *n, hipStream_t stream) {
1135
1136 const dim3 nthrds(1024, 1, 1);
1137 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1138
1140 nblcks, nthrds, 0, stream,
1141 (real *)a, *c, *n);
1143 }
1144
1149 void hip_pwmin_sca3(void *a, void *b, real *c, int *n, hipStream_t stream) {
1150
1151 const dim3 nthrds(1024, 1, 1);
1152 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1153
1155 nblcks, nthrds, 0, stream,
1156 (real *)a, (real *)b, *c, *n);
1158 }
1159
1160 // ======================================================================== //
1161
1165 void hip_iadd(void *a, int *c, int *n, hipStream_t stream) {
1166
1167 const dim3 nthrds(1024, 1, 1);
1168 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
1169
1171 nblcks, nthrds, 0, stream,
1172 (int *) a, *c, *n);
1174 }
1175
1176} /* extern "C" */
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
#define DEVICE_MPI_MAX
#define DEVICE_MPI_MIN
#define DEVICE_MPI_SUM
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
#define DEVICE_NCCL_MAX
#define DEVICE_NCCL_MIN
#define DEVICE_NCCL_SUM
void device_nccl_allreduce(void *sbuf_d, void *rbuf_d, int count, int nbytes, int op, void *stream)
#define HIP_CHECK(err)
Definition check.h:8
void hip_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, hipStream_t strm)
Definition math.hip:666
void hip_global_reduce_add(real *bufred, void *bufred_d, int n, const hipStream_t stream)
Definition math.hip:723
void hip_col3(void *a, void *b, void *c, int *n, hipStream_t strm)
Definition math.hip:559
void hip_cwrap(void *a, real *min_val, real *max_val, int *n, hipStream_t strm)
Definition math.hip:305
void hip_addsqr2s2(void *a, void *b, real *c1, int *n, hipStream_t strm)
Definition math.hip:434
real hip_glmax(void *a, real *ninf, int *n, hipStream_t stream)
Definition math.hip:972
void hip_rzero(void *a, int *n, hipStream_t strm)
Definition math.hip:214
void hip_cfill_mask(void *a, real *c, int *size, void *mask, int *mask_size, hipStream_t strm)
Definition math.hip:198
void hip_pwmax_sca3(void *a, void *b, real *c, int *n, hipStream_t stream)
Definition math.hip:1089
void hip_invcol3(void *a, void *b, void *c, int *n, hipStream_t strm)
Definition math.hip:529
void hip_absval(void *a, int *n, hipStream_t stream)
Definition math.hip:1028
void hip_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
Definition math.hip:111
void hip_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, hipStream_t strm)
Definition math.hip:684
real hip_glsubnorm2(void *a, void *b, int *n, hipStream_t stream)
Definition math.hip:914
void hip_addcol3(void *a, void *b, void *c, int *n, hipStream_t strm)
Definition math.hip:619
void hip_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, hipStream_t strm)
Definition math.hip:450
void hip_masked_scatter_copy_aligned(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
Definition math.hip:164
void hip_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, hipStream_t strm)
Definition math.hip:484
void hip_masked_copy_aligned(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
Definition math.hip:78
void hip_add2s2(void *a, void *b, real *c1, int *n, hipStream_t strm)
Definition math.hip:399
real hip_glsum(void *a, int *n, hipStream_t stream)
Definition math.hip:943
void hip_add4(void *a, void *b, void *c, void *d, int *n, hipStream_t strm)
Definition math.hip:367
void hip_cadd2(void *a, void *b, real *c, int *n, hipStream_t strm)
Definition math.hip:291
real hip_vlsc3(void *u, void *v, void *w, int *n, hipStream_t stream)
Definition math.hip:788
void hip_cdiv2(void *a, void *b, real *c, int *n, hipStream_t strm)
Definition math.hip:263
void hip_pwmin_sca2(void *a, real *c, int *n, hipStream_t stream)
Definition math.hip:1134
void hip_cmult2(void *a, void *b, real *c, int *n, hipStream_t strm)
Definition math.hip:235
real hip_glsc3(void *a, void *b, void *c, int *n, hipStream_t stream)
Definition math.hip:817
void hip_invcol1(void *a, int *n, hipStream_t strm)
Definition math.hip:501
void hip_invcol2(void *a, void *b, int *n, hipStream_t strm)
Definition math.hip:515
void hip_global_reduce_max(real *bufred, void *bufred_d, int n, const hipStream_t stream)
Definition math.hip:745
void hip_cdiv(void *a, real *c, int *n, hipStream_t strm)
Definition math.hip:249
void hip_sub2(void *a, void *b, int *n, hipStream_t strm)
Definition math.hip:589
real hip_glsc2(void *a, void *b, int *n, hipStream_t stream)
Definition math.hip:884
real * bufred
Definition math.hip:705
void hip_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, hipStream_t strm)
Definition math.hip:467
void hip_col2(void *a, void *b, int *n, hipStream_t strm)
Definition math.hip:544
real hip_glmin(void *a, real *pinf, int *n, hipStream_t stream)
Definition math.hip:1000
void hip_pwmin_vec3(void *a, void *b, void *c, int *n, hipStream_t stream)
Definition math.hip:1119
void hip_addcol3s2(void *a, void *b, void *c, real *s, int *n, hipStream_t strm)
Definition math.hip:650
void hip_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
Definition math.hip:95
void hip_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, hipStream_t stream)
Definition math.hip:847
int red_s
Definition math.hip:704
void hip_cfill(void *a, real *c, int *n, hipStream_t strm)
Definition math.hip:320
void hip_masked_atomic_reduction(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
Definition math.hip:181
void hip_masked_copy_0(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
Definition math.hip:61
void hip_add3(void *a, void *b, void *c, int *n, hipStream_t strm)
Definition math.hip:352
void hip_pwmax_vec3(void *a, void *b, void *c, int *n, hipStream_t stream)
Definition math.hip:1060
void hip_add2(void *a, void *b, int *n, hipStream_t strm)
Definition math.hip:337
void hip_copy(void *a, void *b, int *n, hipStream_t strm)
Definition math.hip:53
void hip_subcol3(void *a, void *b, void *c, int *n, hipStream_t strm)
Definition math.hip:574
void hip_pwmax_sca2(void *a, real *c, int *n, hipStream_t stream)
Definition math.hip:1075
void hip_iadd(void *a, int *c, int *n, hipStream_t stream)
Definition math.hip:1165
void hip_pwmin_sca3(void *a, void *b, real *c, int *n, hipStream_t stream)
Definition math.hip:1149
void hip_add2s1(void *a, void *b, real *c1, int *n, hipStream_t strm)
Definition math.hip:383
void hip_radd(void *a, real *c, int *n, hipStream_t strm)
Definition math.hip:277
void hip_redbuf_check_alloc(int nb)
Definition math.hip:708
void hip_add2s2_many(void *x, void **p, void *alpha, int *j, int *n, hipStream_t strm)
Definition math.hip:416
void hip_addcol4(void *a, void *b, void *c, void *d, int *n, hipStream_t strm)
Definition math.hip:634
void hip_pwmax_vec2(void *a, void *b, int *n, hipStream_t stream)
Definition math.hip:1046
void hip_pwmin_vec2(void *a, void *b, int *n, hipStream_t stream)
Definition math.hip:1104
void hip_sub3(void *a, void *b, void *c, int *n, hipStream_t strm)
Definition math.hip:604
void hip_face_masked_gather_copy(void *a, void *b, void *mask, void *facet, int *n1, int *n2, int *lx, int *ly, int *lz, int *m, hipStream_t strm)
Definition math.hip:127
void hip_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
Definition math.hip:147
void hip_cmult(void *a, real *c, int *n, hipStream_t strm)
Definition math.hip:221
real * bufred_d
Definition math.hip:706
void hip_global_reduce_min(real *bufred, void *bufred_d, int n, const hipStream_t stream)
Definition math.hip:766
Object for handling masks in Neko.
Definition mask.f90:34