Neko 0.9.99
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.hip
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2023, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#include <hip/hip_runtime.h>
37#include <device/hip/check.h>
38#include "math_kernel.h"
39
40extern "C" {
41
44
48 void hip_copy(void *a, void *b, int *n) {
49 HIP_CHECK(hipMemcpyAsync(a, b, (*n) * sizeof(real),
51 (hipStream_t) glb_cmd_queue));
52 }
53
57 void hip_masked_copy(void *a, void *b, void *mask, int *n, int *m) {
58
59 const dim3 nthrds(1024, 1, 1);
60 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
61
63 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
64 (real *) a, (real *) b, (int *) mask, *n, *m);
65
67
68 }
69
70
74 void hip_masked_red_copy(void *a, void *b, void *mask, int *n, int *m) {
75
76 const dim3 nthrds(1024, 1, 1);
77 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
78
80 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
81 (real *) a, (real *) b, (int *) mask, *n, *m);
82
84
85 }
86
90 void hip_cfill_mask(void* a, real* c, int* size, void* mask, int* mask_size) {
91
92 const dim3 nthrds(1024, 1, 1);
93 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
94
96 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
97 (real*)a, *c, *size, (int*)mask, *mask_size);
98
100 }
101
105 void hip_rzero(void *a, int *n) {
106 HIP_CHECK(hipMemsetAsync(a, 0, (*n) * sizeof(real),
107 (hipStream_t) glb_cmd_queue));
108 }
109
113 void hip_cmult(void *a, real *c, int *n) {
114
115 const dim3 nthrds(1024, 1, 1);
116 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
117
119 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
120 (real *) a, *c, *n);
122
123 }
124
128 void hip_cmult2(void *a, void *b, real *c, int *n) {
129
130 const dim3 nthrds(1024, 1, 1);
131 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
132
134 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
135 (real *) a,(real *) b, *c, *n);
137
138 }
142 void hip_cadd(void *a, real *c, int *n) {
143
144 const dim3 nthrds(1024, 1, 1);
145 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
146
148 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
149 (real *) a, *c, *n);
151 }
152
157 void hip_cadd2(void *a, void *b, real *c, int *n) {
158
159 const dim3 nthrds(1024, 1, 1);
160 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
161
163 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
164 (real *) a, (real *) b, *c, *n);
166 }
167
171 void hip_cfill(void *a, real *c, int *n) {
172
173 const dim3 nthrds(1024, 1, 1);
174 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
175 if (*n > 0) {
177 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
178 (real *) a, *c, *n);
180 }
181 }
182
187 void hip_add2(void *a, void *b, int *n) {
188
189 const dim3 nthrds(1024, 1, 1);
190 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
191
193 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
194 (real *) a, (real *) b, *n);
196 }
197
202 void hip_add4(void *a, void *b, void *c, void *d, int *n) {
203
204 const dim3 nthrds(1024, 1, 1);
205 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
206
208 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
209 (real *) a, (real *) b, (real *) c, (real *) d, *n);
211 }
212
218 void hip_add2s1(void *a, void *b, real *c1, int *n) {
219
220 const dim3 nthrds(1024, 1, 1);
221 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
222
224 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
225 (real *) a, (real *) b,
226 *c1, *n);
228 }
229
235 void hip_add2s2(void *a, void *b, real *c1, int *n) {
236
237 const dim3 nthrds(1024, 1, 1);
238 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
239
241 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
242 (real *) a, (real *) b,
243 *c1, *n);
245 }
246
252 void hip_addsqr2s2(void *a, void *b, real *c1, int *n) {
253
254 const dim3 nthrds(1024, 1, 1);
255 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
256
258 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
259 (real *) a, (real *) b,
260 *c1, *n);
262 }
263
269 void hip_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n) {
270
271 const dim3 nthrds(1024, 1, 1);
272 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
273
275 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
276 (real *) a, (real *) b, (real *) c,
277 *c1, *c2, *n);
279 }
280
285 void hip_invcol1(void *a, int *n) {
286
287 const dim3 nthrds(1024, 1, 1);
288 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
289
291 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
292 (real *) a, *n);
294 }
295
300 void hip_invcol2(void *a, void *b, int *n) {
301
302 const dim3 nthrds(1024, 1, 1);
303 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
304
306 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
307 (real *) a, (real *) b, *n);
309 }
310
315 void hip_col2(void *a, void *b, int *n) {
316
317 const dim3 nthrds(1024, 1, 1);
318 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
319
321 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
322 (real *) a, (real *) b, *n);
324 }
325
330 void hip_col3(void *a, void *b, void *c, int *n) {
331
332 const dim3 nthrds(1024, 1, 1);
333 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
334
336 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
337 (real *) a, (real *) b, (real *) c, *n);
339 }
340
345 void hip_subcol3(void *a, void *b, void *c, int *n) {
346
347 const dim3 nthrds(1024, 1, 1);
348 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
349
351 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
352 (real *) a, (real *) b, (real *) c, *n);
354 }
355
360 void hip_sub2(void *a, void *b, int *n) {
361
362 const dim3 nthrds(1024, 1, 1);
363 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
364
366 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
367 (real *) a, (real *) b, *n);
369 }
370
375 void hip_sub3(void *a, void *b, void *c, int *n) {
376
377 const dim3 nthrds(1024, 1, 1);
378 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
379
381 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
382 (real *) a, (real *) b, (real *) c, *n);
384 }
385
390 void hip_addcol3(void *a, void *b, void *c, int *n) {
391
392 const dim3 nthrds(1024, 1, 1);
393 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
394
396 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
397 (real *) a, (real *) b, (real *) c, *n);
399 }
400
405 void hip_addcol4(void *a, void *b, void *c, void *d, int *n) {
406
407 const dim3 nthrds(1024, 1, 1);
408 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
409
411 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
412 (real *) a, (real *) b, (real *) c, (real *) d, *n);
414 }
415
420 void hip_vdot3(void *dot, void *u1, void *u2, void *u3,
421 void *v1, void *v2, void *v3, int *n) {
422
423 const dim3 nthrds(1024, 1, 1);
424 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
425
427 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
428 (real *) dot, (real *) u1, (real *) u2, (real *) u3,
429 (real *) v1, (real *) v2, (real *) v3, *n);
431 }
432
437 void hip_vcross(void *u1, void *u2, void *u3,
438 void *v1, void *v2, void *v3,
439 void *w1, void *w2, void *w3, int *n) {
440
441 const dim3 nthrds(1024, 1, 1);
442 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
443
445 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
446 (real *) u1, (real *) u2, (real *) u3,
447 (real *) v1, (real *) v2, (real *) v3,
448 (real *) w1, (real *) w2, (real *) w3, *n);
450 }
451
452
453 /*
454 * Reduction buffer
455 */
456 int red_s = 0;
459
464 real hip_vlsc3(void *u, void *v, void *w, int *n) {
465
466 const dim3 nthrds(1024, 1, 1);
467 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
468 const int nb = ((*n) + 1024 - 1)/ 1024;
469 const hipStream_t stream = (hipStream_t) glb_cmd_queue;
470
471 if ( nb > red_s){
472 red_s = nb;
473 if (bufred != NULL) {
476 }
478 HIP_CHECK(hipMalloc(&bufred_d, nb*sizeof(real)));
479 }
480
482 nblcks, nthrds, 0, stream,
483 (real *) u, (real *) v,
484 (real *) w, bufred_d, *n);
487 1, 1024, 0, stream, bufred_d, nb);
489
491 hipMemcpyDeviceToHost, stream));
493
494 return bufred[0];
495 }
496
497
502 real hip_glsc3(void *a, void *b, void *c, int *n) {
503
504 const dim3 nthrds(1024, 1, 1);
505 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
506 const int nb = ((*n) + 1024 - 1)/ 1024;
507 const hipStream_t stream = (hipStream_t) glb_cmd_queue;
508
509 if ( nb > red_s){
510 red_s = nb;
511 if (bufred != NULL) {
514 }
516 HIP_CHECK(hipMalloc(&bufred_d, nb*sizeof(real)));
517 }
518
520 nblcks, nthrds, 0, stream,
521 (real *) a, (real *) b,
522 (real *) c, bufred_d, *n);
525 1, 1024, 0, stream, bufred_d, nb);
527
528#ifdef HAVE_DEVICE_MPI
531#else
533 hipMemcpyDeviceToHost, stream));
535#endif
536 return bufred[0];
537 }
538
543 void hip_glsc3_many(real *h, void * w, void *v,void *mult, int *j, int *n){
544 int pow2 = 1;
545 while(pow2 < (*j)){
546 pow2 = 2*pow2;
547 }
548 const int nt = 1024/pow2;
549 const dim3 nthrds(pow2, nt, 1);
550 const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
551 const dim3 nthrds_red(1024,1,1);
552 const dim3 nblcks_red( (*j),1,1);
553 const int nb = ((*n) + nt - 1)/nt;
554 const hipStream_t stream = (hipStream_t) glb_cmd_queue;
555
556 if((*j)*nb>red_s){
557 red_s = (*j)*nb;
558 if (bufred != NULL) {
561 }
563 HIP_CHECK(hipMalloc(&bufred_d, (*j)*nb*sizeof(real)));
564 }
566 nblcks, nthrds, 0, stream,
567 (const real *) w, (const real **) v,
568 (const real *)mult, bufred_d, *j, *n);
570
572 nblcks_red, nthrds_red, 0, stream,
573 bufred_d, nb, *j);
575
576#ifdef HAVE_DEVICE_MPI
579#else
580 HIP_CHECK(hipMemcpyAsync(h, bufred_d, (*j)* sizeof(real),
581 hipMemcpyDeviceToHost, stream));
583#endif
584 }
585
592 void hip_add2s2_many(void *x, void **p, void *alpha, int *j, int *n) {
593
594 const dim3 nthrds(1024, 1, 1);
595 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
596
598 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
599 (real *) x, (const real **) p, (real *) alpha, *j, *n);
601
602 }
603
608 void hip_add3(void *a, void *b, void *c, int *n) {
609
610 const dim3 nthrds(1024, 1, 1);
611 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
612
614 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
615 (real *) a, (real *) b, (real *) c, *n);
617 }
618
623 real hip_glsc2(void *a, void *b, int *n) {
624
625 const dim3 nthrds(1024, 1, 1);
626 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
627 const int nb = ((*n) + 1024 - 1)/ 1024;
628 const hipStream_t stream = (hipStream_t) glb_cmd_queue;
629
630 if ( nb > red_s){
631 red_s = nb;
632 if (bufred != NULL) {
635 }
637 HIP_CHECK(hipMalloc(&bufred_d, nb*sizeof(real)));
638 }
639
641 nblcks, nthrds, 0, stream,
642 (real *) a, (real *) b, bufred_d, *n);
645 1, 1024, 0, stream, bufred_d, nb);
647
648#ifdef HAVE_DEVICE_MPI
651#else
653 hipMemcpyDeviceToHost, stream));
655#endif
656 return bufred[0];
657 }
658
663 real hip_glsum(void *a, int *n) {
664 const dim3 nthrds(1024, 1, 1);
665 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
666 const int nb = ((*n) + 1024 - 1)/ 1024;
667 const hipStream_t stream = (hipStream_t) glb_cmd_queue;
668
669 if ( nb > red_s){
670 red_s = nb;
671 if (bufred != NULL) {
674 }
676 HIP_CHECK(hipMalloc(&bufred_d, nb*sizeof(real)));
677 }
678 if( *n > 0) {
680 nblcks, nthrds, 0, stream,
681 (real *) a, bufred_d, *n);
684 1, 1024, 0, stream, bufred_d, nb);
686 }
687#ifdef HAVE_DEVICE_MPI
690#else
692 hipMemcpyDeviceToHost, stream));
694#endif
695 return bufred[0];
696 }
697
701 void hip_absval(void *a, int *n) {
702
703 const dim3 nthrds(1024, 1, 1);
704 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
705
707 nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
708 (real *) a, *n);
710
711 }
712}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
#define DEVICE_MPI_SUM
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
#define HIP_CHECK(err)
Definition check.h:8
void hip_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
Definition math.hip:420
void hip_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n)
Definition math.hip:437
void hip_cmult2(void *a, void *b, real *c, int *n)
Definition math.hip:128
real hip_glsc3(void *a, void *b, void *c, int *n)
Definition math.hip:502
void hip_cfill_mask(void *a, real *c, int *size, void *mask, int *mask_size)
Definition math.hip:90
void hip_invcol2(void *a, void *b, int *n)
Definition math.hip:300
void hip_cadd2(void *a, void *b, real *c, int *n)
Definition math.hip:157
void hip_masked_red_copy(void *a, void *b, void *mask, int *n, int *m)
Definition math.hip:74
void hip_invcol1(void *a, int *n)
Definition math.hip:285
void hip_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
Definition math.hip:269
void hip_subcol3(void *a, void *b, void *c, int *n)
Definition math.hip:345
void hip_col3(void *a, void *b, void *c, int *n)
Definition math.hip:330
real hip_glsc2(void *a, void *b, int *n)
Definition math.hip:623
void hip_masked_copy(void *a, void *b, void *mask, int *n, int *m)
Definition math.hip:57
void hip_copy(void *a, void *b, int *n)
Definition math.hip:48
void hip_add2(void *a, void *b, int *n)
Definition math.hip:187
real hip_vlsc3(void *u, void *v, void *w, int *n)
Definition math.hip:464
void hip_add3(void *a, void *b, void *c, int *n)
Definition math.hip:608
real * bufred
Definition math.hip:457
void hip_addsqr2s2(void *a, void *b, real *c1, int *n)
Definition math.hip:252
void hip_add2s2(void *a, void *b, real *c1, int *n)
Definition math.hip:235
void hip_rzero(void *a, int *n)
Definition math.hip:105
void hip_sub2(void *a, void *b, int *n)
Definition math.hip:360
void hip_cadd(void *a, real *c, int *n)
Definition math.hip:142
real hip_glsum(void *a, int *n)
Definition math.hip:663
int red_s
Definition math.hip:456
void hip_addcol3(void *a, void *b, void *c, int *n)
Definition math.hip:390
void hip_cfill(void *a, real *c, int *n)
Definition math.hip:171
void hip_absval(void *a, int *n)
Definition math.hip:701
void hip_col2(void *a, void *b, int *n)
Definition math.hip:315
void hip_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
Definition math.hip:543
void hip_sub3(void *a, void *b, void *c, int *n)
Definition math.hip:375
void hip_add4(void *a, void *b, void *c, void *d, int *n)
Definition math.hip:202
void hip_cmult(void *a, real *c, int *n)
Definition math.hip:113
void hip_add2s2_many(void *x, void **p, void *alpha, int *j, int *n)
Definition math.hip:592
void hip_addcol4(void *a, void *b, void *c, void *d, int *n)
Definition math.hip:405
real * bufred_d
Definition math.hip:458
void hip_add2s1(void *a, void *b, real *c1, int *n)
Definition math.hip:218