Loading [MathJax]/extensions/tex2jax.js
Neko 0.9.99
A portable framework for high-order spectral element flow simulations
All Classes Namespaces Files Functions Variables Typedefs Enumerator Macros Pages
math.hip
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#include <hip/hip_runtime.h>
37#include <device/hip/check.h>
38#include "math_kernel.h"
39
40extern "C" {
41
44
45#ifdef HAVE_RCCL
48#endif
49
53 void hip_copy(void *a, void *b, int *n) {
54 HIP_CHECK(hipMemcpyAsync(a, b, (*n) * sizeof(real),
56 (hipStream_t) glb_cmd_queue));
57 }
58
62 void hip_masked_copy(void *a, void *b, void *mask, int *n, int *m) {
63
64 const dim3 nthrds(1024, 1, 1);
65 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
66
68 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
69 (real *) a, (real *) b, (int *) mask, *n, *m);
70
72
73 }
74
75
79 void hip_masked_red_copy(void *a, void *b, void *mask, int *n, int *m) {
80
81 const dim3 nthrds(1024, 1, 1);
82 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
83
85 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
86 (real *) a, (real *) b, (int *) mask, *n, *m);
87
89
90 }
91
95 void hip_masked_atomic_reduction(void *a, void *b, void *mask, int *n, int *m) {
96
97 const dim3 nthrds(1024, 1, 1);
98 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
99
101 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
102 (real *) a, (real *) b, (int *) mask, *n, *m);
103
105
106 }
107
111 void hip_cfill_mask(void* a, real* c, int* size, void* mask, int* mask_size) {
112
113 const dim3 nthrds(1024, 1, 1);
114 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
115
117 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
118 (real*)a, *c, *size, (int*)mask, *mask_size);
119
121 }
122
126 void hip_rzero(void *a, int *n) {
127 HIP_CHECK(hipMemsetAsync(a, 0, (*n) * sizeof(real),
128 (hipStream_t) glb_cmd_queue));
129 }
130
134 void hip_cmult(void *a, real *c, int *n) {
135
136 const dim3 nthrds(1024, 1, 1);
137 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
138
140 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
141 (real *) a, *c, *n);
143
144 }
145
149 void hip_cmult2(void *a, void *b, real *c, int *n) {
150
151 const dim3 nthrds(1024, 1, 1);
152 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
153
155 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
156 (real *) a,(real *) b, *c, *n);
158
159 }
163 void hip_cadd(void *a, real *c, int *n) {
164
165 const dim3 nthrds(1024, 1, 1);
166 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
167
169 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
170 (real *) a, *c, *n);
172 }
173
178 void hip_cadd2(void *a, void *b, real *c, int *n) {
179
180 const dim3 nthrds(1024, 1, 1);
181 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
182
184 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
185 (real *) a, (real *) b, *c, *n);
187 }
188
192 void hip_cfill(void *a, real *c, int *n) {
193
194 const dim3 nthrds(1024, 1, 1);
195 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
196 if (*n > 0) {
198 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
199 (real *) a, *c, *n);
201 }
202 }
203
208 void hip_add2(void *a, void *b, int *n) {
209
210 const dim3 nthrds(1024, 1, 1);
211 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
212
214 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
215 (real *) a, (real *) b, *n);
217 }
218
223 void hip_add4(void *a, void *b, void *c, void *d, int *n) {
224
225 const dim3 nthrds(1024, 1, 1);
226 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
227
229 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
230 (real *) a, (real *) b, (real *) c, (real *) d, *n);
232 }
233
239 void hip_add2s1(void *a, void *b, real *c1, int *n) {
240
241 const dim3 nthrds(1024, 1, 1);
242 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
243
245 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
246 (real *) a, (real *) b,
247 *c1, *n);
249 }
250
256 void hip_add2s2(void *a, void *b, real *c1, int *n) {
257
258 const dim3 nthrds(1024, 1, 1);
259 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
260
262 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
263 (real *) a, (real *) b,
264 *c1, *n);
266 }
267
273 void hip_addsqr2s2(void *a, void *b, real *c1, int *n) {
274
275 const dim3 nthrds(1024, 1, 1);
276 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
277
279 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
280 (real *) a, (real *) b,
281 *c1, *n);
283 }
284
290 void hip_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n) {
291
292 const dim3 nthrds(1024, 1, 1);
293 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
294
296 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
297 (real *) a, (real *) b, (real *) c,
298 *c1, *c2, *n);
300 }
301
306 void hip_invcol1(void *a, int *n) {
307
308 const dim3 nthrds(1024, 1, 1);
309 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
310
312 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
313 (real *) a, *n);
315 }
316
321 void hip_invcol2(void *a, void *b, int *n) {
322
323 const dim3 nthrds(1024, 1, 1);
324 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
325
327 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
328 (real *) a, (real *) b, *n);
330 }
331
336 void hip_col2(void *a, void *b, int *n) {
337
338 const dim3 nthrds(1024, 1, 1);
339 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
340
342 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
343 (real *) a, (real *) b, *n);
345 }
346
351 void hip_col3(void *a, void *b, void *c, int *n) {
352
353 const dim3 nthrds(1024, 1, 1);
354 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
355
357 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
358 (real *) a, (real *) b, (real *) c, *n);
360 }
361
366 void hip_subcol3(void *a, void *b, void *c, int *n) {
367
368 const dim3 nthrds(1024, 1, 1);
369 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
370
372 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
373 (real *) a, (real *) b, (real *) c, *n);
375 }
376
381 void hip_sub2(void *a, void *b, int *n) {
382
383 const dim3 nthrds(1024, 1, 1);
384 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
385
387 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
388 (real *) a, (real *) b, *n);
390 }
391
396 void hip_sub3(void *a, void *b, void *c, int *n) {
397
398 const dim3 nthrds(1024, 1, 1);
399 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
400
402 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
403 (real *) a, (real *) b, (real *) c, *n);
405 }
406
411 void hip_addcol3(void *a, void *b, void *c, int *n) {
412
413 const dim3 nthrds(1024, 1, 1);
414 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
415
417 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
418 (real *) a, (real *) b, (real *) c, *n);
420 }
421
426 void hip_addcol4(void *a, void *b, void *c, void *d, int *n) {
427
428 const dim3 nthrds(1024, 1, 1);
429 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
430
432 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
433 (real *) a, (real *) b, (real *) c, (real *) d, *n);
435 }
436
441 void hip_vdot3(void *dot, void *u1, void *u2, void *u3,
442 void *v1, void *v2, void *v3, int *n) {
443
444 const dim3 nthrds(1024, 1, 1);
445 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
446
448 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
449 (real *) dot, (real *) u1, (real *) u2, (real *) u3,
450 (real *) v1, (real *) v2, (real *) v3, *n);
452 }
453
458 void hip_vcross(void *u1, void *u2, void *u3,
459 void *v1, void *v2, void *v3,
460 void *w1, void *w2, void *w3, int *n) {
461
462 const dim3 nthrds(1024, 1, 1);
463 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
464
466 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
467 (real *) u1, (real *) u2, (real *) u3,
468 (real *) v1, (real *) v2, (real *) v3,
469 (real *) w1, (real *) w2, (real *) w3, *n);
471 }
472
473
474 /*
475 * Reduction buffer
476 */
477 int red_s = 0;
480
485 real hip_vlsc3(void *u, void *v, void *w, int *n) {
486
487 const dim3 nthrds(1024, 1, 1);
488 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
489 const int nb = ((*n) + 1024 - 1)/ 1024;
490 const hipStream_t stream = (hipStream_t) glb_cmd_queue;
491
492 if ( nb > red_s){
493 red_s = nb;
494 if (bufred != NULL) {
497 }
499 HIP_CHECK(hipMalloc(&bufred_d, nb*sizeof(real)));
500 }
501
503 nblcks, nthrds, 0, stream,
504 (real *) u, (real *) v,
505 (real *) w, bufred_d, *n);
508 1, 1024, 0, stream, bufred_d, nb);
510
512 hipMemcpyDeviceToHost, stream));
514
515 return bufred[0];
516 }
517
518
523 real hip_glsc3(void *a, void *b, void *c, int *n) {
524
525 const dim3 nthrds(1024, 1, 1);
526 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
527 const int nb = ((*n) + 1024 - 1)/ 1024;
528 const hipStream_t stream = (hipStream_t) glb_cmd_queue;
529
530 if ( nb > red_s){
531 red_s = nb;
532 if (bufred != NULL) {
535 }
537 HIP_CHECK(hipMalloc(&bufred_d, nb*sizeof(real)));
538 }
539
541 nblcks, nthrds, 0, stream,
542 (real *) a, (real *) b,
543 (real *) c, bufred_d, *n);
546 1, 1024, 0, stream, bufred_d, nb);
548
549#ifdef HAVE_RCCL
551 DEVICE_NCCL_SUM, stream);
553 hipMemcpyDeviceToHost, stream));
555#elif HAVE_DEVICE_MPI
558#else
560 hipMemcpyDeviceToHost, stream));
562#endif
563 return bufred[0];
564 }
565
570 void hip_glsc3_many(real *h, void * w, void *v,void *mult, int *j, int *n){
571 int pow2 = 1;
572 while(pow2 < (*j)){
573 pow2 = 2*pow2;
574 }
575 const int nt = 1024/pow2;
576 const dim3 nthrds(pow2, nt, 1);
577 const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
578 const dim3 nthrds_red(1024,1,1);
579 const dim3 nblcks_red( (*j),1,1);
580 const int nb = ((*n) + nt - 1)/nt;
581 const hipStream_t stream = (hipStream_t) glb_cmd_queue;
582
583 if((*j)*nb>red_s){
584 red_s = (*j)*nb;
585 if (bufred != NULL) {
588 }
590 HIP_CHECK(hipMalloc(&bufred_d, (*j)*nb*sizeof(real)));
591 }
593 nblcks, nthrds, 0, stream,
594 (const real *) w, (const real **) v,
595 (const real *)mult, bufred_d, *j, *n);
597
599 nblcks_red, nthrds_red, 0, stream,
600 bufred_d, nb, *j);
602
603#ifdef HAVE_RCCL
605 DEVICE_NCCL_SUM, stream);
606 HIP_CHECK(hipMemcpyAsync(h, bufred_d, (*j)* sizeof(real),
607 hipMemcpyDeviceToHost, stream));
609#elif HAVE_DEVICE_MPI
612#else
613 HIP_CHECK(hipMemcpyAsync(h, bufred_d, (*j)* sizeof(real),
614 hipMemcpyDeviceToHost, stream));
616#endif
617 }
618
625 void hip_add2s2_many(void *x, void **p, void *alpha, int *j, int *n) {
626
627 const dim3 nthrds(1024, 1, 1);
628 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
629
631 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
632 (real *) x, (const real **) p, (real *) alpha, *j, *n);
634
635 }
636
641 void hip_add3(void *a, void *b, void *c, int *n) {
642
643 const dim3 nthrds(1024, 1, 1);
644 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
645
647 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
648 (real *) a, (real *) b, (real *) c, *n);
650 }
651
656 real hip_glsc2(void *a, void *b, int *n) {
657
658 const dim3 nthrds(1024, 1, 1);
659 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
660 const int nb = ((*n) + 1024 - 1)/ 1024;
661 const hipStream_t stream = (hipStream_t) glb_cmd_queue;
662
663 if ( nb > red_s){
664 red_s = nb;
665 if (bufred != NULL) {
668 }
670 HIP_CHECK(hipMalloc(&bufred_d, nb*sizeof(real)));
671 }
672
674 nblcks, nthrds, 0, stream,
675 (real *) a, (real *) b, bufred_d, *n);
678 1, 1024, 0, stream, bufred_d, nb);
680
681#ifdef HAVE_RCCL
683 DEVICE_NCCL_SUM, stream);
685 hipMemcpyDeviceToHost, stream));
687#elif HAVE_DEVICE_MPI
690#else
692 hipMemcpyDeviceToHost, stream));
694#endif
695 return bufred[0];
696 }
697
702 real hip_glsum(void *a, int *n) {
703 const dim3 nthrds(1024, 1, 1);
704 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
705 const int nb = ((*n) + 1024 - 1)/ 1024;
706 const hipStream_t stream = (hipStream_t) glb_cmd_queue;
707
708 if ( nb > red_s){
709 red_s = nb;
710 if (bufred != NULL) {
713 }
715 HIP_CHECK(hipMalloc(&bufred_d, nb*sizeof(real)));
716 }
717 if( *n > 0) {
719 nblcks, nthrds, 0, stream,
720 (real *) a, bufred_d, *n);
723 1, 1024, 0, stream, bufred_d, nb);
725 }
726
727#ifdef HAVE_RCCL
729 DEVICE_NCCL_SUM, stream);
731 hipMemcpyDeviceToHost, stream));
733#elif HAVE_DEVICE_MPI
736#else
738 hipMemcpyDeviceToHost, stream));
740#endif
741 return bufred[0];
742 }
743
747 void hip_absval(void *a, int *n) {
748
749 const dim3 nthrds(1024, 1, 1);
750 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
751
753 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
754 (real *) a, *n);
756
757 }
758}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
#define DEVICE_MPI_SUM
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
#define DEVICE_NCCL_SUM
void device_nccl_allreduce(void *sbuf_d, void *rbuf_d, int count, int nbytes, int op, void *stream)
#define HIP_CHECK(err)
Definition check.h:8
void hip_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
Definition math.hip:441
void hip_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n)
Definition math.hip:458
void hip_cmult2(void *a, void *b, real *c, int *n)
Definition math.hip:149
real hip_glsc3(void *a, void *b, void *c, int *n)
Definition math.hip:523
void hip_cfill_mask(void *a, real *c, int *size, void *mask, int *mask_size)
Definition math.hip:111
void hip_invcol2(void *a, void *b, int *n)
Definition math.hip:321
void hip_cadd2(void *a, void *b, real *c, int *n)
Definition math.hip:178
void hip_masked_red_copy(void *a, void *b, void *mask, int *n, int *m)
Definition math.hip:79
void hip_invcol1(void *a, int *n)
Definition math.hip:306
void hip_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
Definition math.hip:290
void hip_subcol3(void *a, void *b, void *c, int *n)
Definition math.hip:366
void hip_col3(void *a, void *b, void *c, int *n)
Definition math.hip:351
real hip_glsc2(void *a, void *b, int *n)
Definition math.hip:656
void hip_masked_copy(void *a, void *b, void *mask, int *n, int *m)
Definition math.hip:62
void hip_copy(void *a, void *b, int *n)
Definition math.hip:53
void hip_add2(void *a, void *b, int *n)
Definition math.hip:208
real hip_vlsc3(void *u, void *v, void *w, int *n)
Definition math.hip:485
void hip_add3(void *a, void *b, void *c, int *n)
Definition math.hip:641
real * bufred
Definition math.hip:478
void hip_addsqr2s2(void *a, void *b, real *c1, int *n)
Definition math.hip:273
void hip_add2s2(void *a, void *b, real *c1, int *n)
Definition math.hip:256
void hip_rzero(void *a, int *n)
Definition math.hip:126
void hip_sub2(void *a, void *b, int *n)
Definition math.hip:381
void hip_cadd(void *a, real *c, int *n)
Definition math.hip:163
real hip_glsum(void *a, int *n)
Definition math.hip:702
int red_s
Definition math.hip:477
void hip_addcol3(void *a, void *b, void *c, int *n)
Definition math.hip:411
void hip_cfill(void *a, real *c, int *n)
Definition math.hip:192
void hip_absval(void *a, int *n)
Definition math.hip:747
void hip_col2(void *a, void *b, int *n)
Definition math.hip:336
void hip_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
Definition math.hip:570
void hip_sub3(void *a, void *b, void *c, int *n)
Definition math.hip:396
void hip_add4(void *a, void *b, void *c, void *d, int *n)
Definition math.hip:223
void hip_cmult(void *a, real *c, int *n)
Definition math.hip:134
void hip_masked_atomic_reduction(void *a, void *b, void *mask, int *n, int *m)
Definition math.hip:95
void hip_add2s2_many(void *x, void **p, void *alpha, int *j, int *n)
Definition math.hip:625
void hip_addcol4(void *a, void *b, void *c, void *d, int *n)
Definition math.hip:426
real * bufred_d
Definition math.hip:479
void hip_add2s1(void *a, void *b, real *c1, int *n)
Definition math.hip:239