35 #include <hip/hip_runtime.h>
50 hipMemcpyDeviceToDevice,
59 const dim3 nthrds(1024, 1, 1);
60 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
62 hipLaunchKernelGGL(HIP_KERNEL_NAME(masked_copy_kernel<real>),
64 (
real *) a, (
real *) b, (
int *) mask, *n, *m);
76 const dim3 nthrds(1024, 1, 1);
77 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
79 hipLaunchKernelGGL(HIP_KERNEL_NAME(masked_red_copy_kernel<real>),
81 (
real *) a, (
real *) b, (
int *) mask, *n, *m);
92 const dim3 nthrds(1024, 1, 1);
93 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
95 hipLaunchKernelGGL(HIP_KERNEL_NAME(cfill_mask_kernel<real>),
97 (
real*)a, *c, *size, (
int*)mask, *mask_size);
115 const dim3 nthrds(1024, 1, 1);
116 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
118 hipLaunchKernelGGL(HIP_KERNEL_NAME(cmult_kernel<real>),
130 const dim3 nthrds(1024, 1, 1);
131 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
133 hipLaunchKernelGGL(HIP_KERNEL_NAME(cmult2_kernel<real>),
144 const dim3 nthrds(1024, 1, 1);
145 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
147 hipLaunchKernelGGL(HIP_KERNEL_NAME(cadd_kernel<real>),
159 const dim3 nthrds(1024, 1, 1);
160 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
162 hipLaunchKernelGGL(HIP_KERNEL_NAME(cadd2_kernel<real>),
173 const dim3 nthrds(1024, 1, 1);
174 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
176 hipLaunchKernelGGL(HIP_KERNEL_NAME(cfill_kernel<real>),
189 const dim3 nthrds(1024, 1, 1);
190 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
192 hipLaunchKernelGGL(HIP_KERNEL_NAME(add2_kernel<real>),
202 void hip_add4(
void *a,
void *b,
void *c,
void *d,
int *n) {
204 const dim3 nthrds(1024, 1, 1);
205 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
207 hipLaunchKernelGGL(HIP_KERNEL_NAME(add4_kernel<real>),
220 const dim3 nthrds(1024, 1, 1);
221 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
223 hipLaunchKernelGGL(HIP_KERNEL_NAME(add2s1_kernel<real>),
237 const dim3 nthrds(1024, 1, 1);
238 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
240 hipLaunchKernelGGL(HIP_KERNEL_NAME(add2s2_kernel<real>),
254 const dim3 nthrds(1024, 1, 1);
255 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
257 hipLaunchKernelGGL(HIP_KERNEL_NAME(addsqr2s2_kernel<real>),
271 const dim3 nthrds(1024, 1, 1);
272 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
274 hipLaunchKernelGGL(HIP_KERNEL_NAME(add3s2_kernel<real>),
287 const dim3 nthrds(1024, 1, 1);
288 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
290 hipLaunchKernelGGL(HIP_KERNEL_NAME(invcol1_kernel<real>),
302 const dim3 nthrds(1024, 1, 1);
303 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
305 hipLaunchKernelGGL(HIP_KERNEL_NAME(invcol2_kernel<real>),
317 const dim3 nthrds(1024, 1, 1);
318 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
320 hipLaunchKernelGGL(HIP_KERNEL_NAME(col2_kernel<real>),
332 const dim3 nthrds(1024, 1, 1);
333 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
335 hipLaunchKernelGGL(HIP_KERNEL_NAME(col3_kernel<real>),
347 const dim3 nthrds(1024, 1, 1);
348 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
350 hipLaunchKernelGGL(HIP_KERNEL_NAME(subcol3_kernel<real>),
362 const dim3 nthrds(1024, 1, 1);
363 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
365 hipLaunchKernelGGL(HIP_KERNEL_NAME(sub2_kernel<real>),
377 const dim3 nthrds(1024, 1, 1);
378 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
380 hipLaunchKernelGGL(HIP_KERNEL_NAME(sub3_kernel<real>),
392 const dim3 nthrds(1024, 1, 1);
393 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
395 hipLaunchKernelGGL(HIP_KERNEL_NAME(addcol3_kernel<real>),
407 const dim3 nthrds(1024, 1, 1);
408 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
410 hipLaunchKernelGGL(HIP_KERNEL_NAME(addcol4_kernel<real>),
421 void *v1,
void *v2,
void *v3,
int *n) {
423 const dim3 nthrds(1024, 1, 1);
424 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
426 hipLaunchKernelGGL(HIP_KERNEL_NAME(vdot3_kernel<real>),
438 void *v1,
void *v2,
void *v3,
439 void *w1,
void *w2,
void *
w3,
int *n) {
441 const dim3 nthrds(1024, 1, 1);
442 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
444 hipLaunchKernelGGL(HIP_KERNEL_NAME(vcross_kernel<real>),
466 const dim3 nthrds(1024, 1, 1);
467 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
468 const int nb = ((*n) + 1024 - 1)/ 1024;
481 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_kernel<real>),
482 nblcks, nthrds, 0, stream,
486 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
491 hipMemcpyDeviceToHost, stream));
504 const dim3 nthrds(1024, 1, 1);
505 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
506 const int nb = ((*n) + 1024 - 1)/ 1024;
519 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_kernel<real>),
520 nblcks, nthrds, 0, stream,
524 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
528 #ifdef HAVE_DEVICE_MPI
533 hipMemcpyDeviceToHost, stream));
548 const int nt = 1024/pow2;
549 const dim3 nthrds(pow2, nt, 1);
550 const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
551 const dim3 nthrds_red(1024,1,1);
552 const dim3 nblcks_red( (*
j),1,1);
553 const int nb = ((*n) + nt - 1)/nt;
565 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_many_kernel<real>),
566 nblcks, nthrds, 0, stream,
571 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_reduce_kernel<real>),
572 nblcks_red, nthrds_red, 0, stream,
576 #ifdef HAVE_DEVICE_MPI
581 hipMemcpyDeviceToHost, stream));
594 const dim3 nthrds(1024, 1, 1);
595 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
597 hipLaunchKernelGGL(HIP_KERNEL_NAME(add2s2_many_kernel<real>),
610 const dim3 nthrds(1024, 1, 1);
611 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
613 hipLaunchKernelGGL(HIP_KERNEL_NAME(add3_kernel<real>),
625 const dim3 nthrds(1024, 1, 1);
626 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
627 const int nb = ((*n) + 1024 - 1)/ 1024;
640 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc2_kernel<real>),
641 nblcks, nthrds, 0, stream,
644 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
648 #ifdef HAVE_DEVICE_MPI
653 hipMemcpyDeviceToHost, stream));
664 const dim3 nthrds(1024, 1, 1);
665 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
666 const int nb = ((*n) + 1024 - 1)/ 1024;
679 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsum_kernel<real>),
680 nblcks, nthrds, 0, stream,
683 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
687 #ifdef HAVE_DEVICE_MPI
692 hipMemcpyDeviceToHost, stream));
703 const dim3 nthrds(1024, 1, 1);
704 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
706 hipLaunchKernelGGL(HIP_KERNEL_NAME(absval_kernel<real>),
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
void hip_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
void hip_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n)
void hip_cmult2(void *a, void *b, real *c, int *n)
real hip_glsc3(void *a, void *b, void *c, int *n)
void hip_cfill_mask(void *a, real *c, int *size, void *mask, int *mask_size)
void hip_invcol2(void *a, void *b, int *n)
void hip_cadd2(void *a, void *b, real *c, int *n)
void hip_masked_red_copy(void *a, void *b, void *mask, int *n, int *m)
void hip_invcol1(void *a, int *n)
void hip_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
void hip_subcol3(void *a, void *b, void *c, int *n)
void hip_col3(void *a, void *b, void *c, int *n)
real hip_glsc2(void *a, void *b, int *n)
void hip_masked_copy(void *a, void *b, void *mask, int *n, int *m)
void hip_copy(void *a, void *b, int *n)
void hip_add2(void *a, void *b, int *n)
real hip_vlsc3(void *u, void *v, void *w, int *n)
void hip_add3(void *a, void *b, void *c, int *n)
void hip_addsqr2s2(void *a, void *b, real *c1, int *n)
void hip_add2s2(void *a, void *b, real *c1, int *n)
void hip_rzero(void *a, int *n)
void hip_sub2(void *a, void *b, int *n)
void hip_cadd(void *a, real *c, int *n)
real hip_glsum(void *a, int *n)
void hip_addcol3(void *a, void *b, void *c, int *n)
void hip_cfill(void *a, real *c, int *n)
void hip_absval(void *a, int *n)
void hip_col2(void *a, void *b, int *n)
void hip_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
void hip_sub3(void *a, void *b, void *c, int *n)
void hip_add4(void *a, void *b, void *c, void *d, int *n)
void hip_cmult(void *a, real *c, int *n)
void hip_add2s2_many(void *x, void **p, void *alpha, int *j, int *n)
void hip_addcol4(void *a, void *b, void *c, void *d, int *n)
void hip_add2s1(void *a, void *b, real *c1, int *n)