35 #include <hip/hip_runtime.h>
50 hipMemcpyDeviceToDevice,
59 const dim3 nthrds(1024, 1, 1);
60 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
62 hipLaunchKernelGGL(HIP_KERNEL_NAME(masked_copy_kernel<real>),
64 (
real *) a, (
real *) b, (
int *) mask, *n, *m);
75 const dim3 nthrds(1024, 1, 1);
76 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
78 hipLaunchKernelGGL(HIP_KERNEL_NAME(cfill_mask_kernel<real>),
80 (
real*)a, *c, *size, (
int*)mask, *mask_size);
98 const dim3 nthrds(1024, 1, 1);
99 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
101 hipLaunchKernelGGL(HIP_KERNEL_NAME(cmult_kernel<real>),
113 const dim3 nthrds(1024, 1, 1);
114 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
116 hipLaunchKernelGGL(HIP_KERNEL_NAME(cmult2_kernel<real>),
127 const dim3 nthrds(1024, 1, 1);
128 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
130 hipLaunchKernelGGL(HIP_KERNEL_NAME(cadd_kernel<real>),
142 const dim3 nthrds(1024, 1, 1);
143 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
145 hipLaunchKernelGGL(HIP_KERNEL_NAME(cadd2_kernel<real>),
156 const dim3 nthrds(1024, 1, 1);
157 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
159 hipLaunchKernelGGL(HIP_KERNEL_NAME(cfill_kernel<real>),
171 const dim3 nthrds(1024, 1, 1);
172 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
174 hipLaunchKernelGGL(HIP_KERNEL_NAME(add2_kernel<real>),
187 const dim3 nthrds(1024, 1, 1);
188 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
190 hipLaunchKernelGGL(HIP_KERNEL_NAME(add2s1_kernel<real>),
204 const dim3 nthrds(1024, 1, 1);
205 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
207 hipLaunchKernelGGL(HIP_KERNEL_NAME(add2s2_kernel<real>),
221 const dim3 nthrds(1024, 1, 1);
222 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
224 hipLaunchKernelGGL(HIP_KERNEL_NAME(addsqr2s2_kernel<real>),
238 const dim3 nthrds(1024, 1, 1);
239 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
241 hipLaunchKernelGGL(HIP_KERNEL_NAME(add3s2_kernel<real>),
254 const dim3 nthrds(1024, 1, 1);
255 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
257 hipLaunchKernelGGL(HIP_KERNEL_NAME(invcol1_kernel<real>),
269 const dim3 nthrds(1024, 1, 1);
270 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
272 hipLaunchKernelGGL(HIP_KERNEL_NAME(invcol2_kernel<real>),
284 const dim3 nthrds(1024, 1, 1);
285 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
287 hipLaunchKernelGGL(HIP_KERNEL_NAME(col2_kernel<real>),
299 const dim3 nthrds(1024, 1, 1);
300 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
302 hipLaunchKernelGGL(HIP_KERNEL_NAME(col3_kernel<real>),
314 const dim3 nthrds(1024, 1, 1);
315 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
317 hipLaunchKernelGGL(HIP_KERNEL_NAME(subcol3_kernel<real>),
329 const dim3 nthrds(1024, 1, 1);
330 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
332 hipLaunchKernelGGL(HIP_KERNEL_NAME(sub2_kernel<real>),
344 const dim3 nthrds(1024, 1, 1);
345 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
347 hipLaunchKernelGGL(HIP_KERNEL_NAME(sub3_kernel<real>),
359 const dim3 nthrds(1024, 1, 1);
360 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
362 hipLaunchKernelGGL(HIP_KERNEL_NAME(addcol3_kernel<real>),
374 const dim3 nthrds(1024, 1, 1);
375 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
377 hipLaunchKernelGGL(HIP_KERNEL_NAME(addcol4_kernel<real>),
388 void *v1,
void *v2,
void *v3,
int *n) {
390 const dim3 nthrds(1024, 1, 1);
391 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
393 hipLaunchKernelGGL(HIP_KERNEL_NAME(vdot3_kernel<real>),
413 const dim3 nthrds(1024, 1, 1);
414 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
415 const int nb = ((*n) + 1024 - 1)/ 1024;
428 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_kernel<real>),
429 nblcks, nthrds, 0, stream,
433 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
438 hipMemcpyDeviceToHost, stream));
451 const dim3 nthrds(1024, 1, 1);
452 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
453 const int nb = ((*n) + 1024 - 1)/ 1024;
466 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_kernel<real>),
467 nblcks, nthrds, 0, stream,
471 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
475 #ifdef HAVE_DEVICE_MPI
480 hipMemcpyDeviceToHost, stream));
495 const int nt = 1024/pow2;
496 const dim3 nthrds(pow2, nt, 1);
497 const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
498 const dim3 nthrds_red(1024,1,1);
499 const dim3 nblcks_red( (*
j),1,1);
500 const int nb = ((*n) + nt - 1)/nt;
512 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_many_kernel<real>),
513 nblcks, nthrds, 0, stream,
518 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_reduce_kernel<real>),
519 nblcks_red, nthrds_red, 0, stream,
523 #ifdef HAVE_DEVICE_MPI
528 hipMemcpyDeviceToHost, stream));
541 const dim3 nthrds(1024, 1, 1);
542 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
544 hipLaunchKernelGGL(HIP_KERNEL_NAME(add2s2_many_kernel<real>),
557 const dim3 nthrds(1024, 1, 1);
558 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
560 hipLaunchKernelGGL(HIP_KERNEL_NAME(add3_kernel<real>),
572 const dim3 nthrds(1024, 1, 1);
573 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
574 const int nb = ((*n) + 1024 - 1)/ 1024;
587 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc2_kernel<real>),
588 nblcks, nthrds, 0, stream,
591 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
595 #ifdef HAVE_DEVICE_MPI
600 hipMemcpyDeviceToHost, stream));
611 const dim3 nthrds(1024, 1, 1);
612 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
613 const int nb = ((*n) + 1024 - 1)/ 1024;
626 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsum_kernel<real>),
627 nblcks, nthrds, 0, stream,
630 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
634 #ifdef HAVE_DEVICE_MPI
639 hipMemcpyDeviceToHost, stream));
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ u
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void const T *__restrict__ const T *__restrict__ v
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
void hip_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
void hip_cmult2(void *a, void *b, real *c, int *n)
real hip_glsc3(void *a, void *b, void *c, int *n)
void hip_cfill_mask(void *a, real *c, int *size, void *mask, int *mask_size)
void hip_invcol2(void *a, void *b, int *n)
void hip_cadd2(void *a, void *b, real *c, int *n)
void hip_invcol1(void *a, int *n)
void hip_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
void hip_subcol3(void *a, void *b, void *c, int *n)
void hip_col3(void *a, void *b, void *c, int *n)
real hip_glsc2(void *a, void *b, int *n)
void hip_masked_copy(void *a, void *b, void *mask, int *n, int *m)
void hip_copy(void *a, void *b, int *n)
void hip_add2(void *a, void *b, int *n)
real hip_vlsc3(void *u, void *v, void *w, int *n)
void hip_add3(void *a, void *b, void *c, int *n)
void hip_addsqr2s2(void *a, void *b, real *c1, int *n)
void hip_add2s2(void *a, void *b, real *c1, int *n)
void hip_rzero(void *a, int *n)
void hip_sub2(void *a, void *b, int *n)
void hip_cadd(void *a, real *c, int *n)
real hip_glsum(void *a, int *n)
void hip_addcol3(void *a, void *b, void *c, int *n)
void hip_cfill(void *a, real *c, int *n)
void hip_col2(void *a, void *b, int *n)
void hip_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
void hip_sub3(void *a, void *b, void *c, int *n)
void hip_cmult(void *a, real *c, int *n)
void hip_add2s2_many(void *x, void **p, void *alpha, int *j, int *n)
void hip_addcol4(void *a, void *b, void *c, void *d, int *n)
void hip_add2s1(void *a, void *b, real *c1, int *n)