35 #include <hip/hip_runtime.h>
50 hipMemcpyDeviceToDevice,
59 const dim3 nthrds(1024, 1, 1);
60 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
62 hipLaunchKernelGGL(HIP_KERNEL_NAME(masked_copy_kernel<real>),
64 (
real *) a, (
real *) b, (
int *) mask, *n, *m);
82 const dim3 nthrds(1024, 1, 1);
83 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
85 hipLaunchKernelGGL(HIP_KERNEL_NAME(cmult_kernel<real>),
97 const dim3 nthrds(1024, 1, 1);
98 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
100 hipLaunchKernelGGL(HIP_KERNEL_NAME(cmult2_kernel<real>),
111 const dim3 nthrds(1024, 1, 1);
112 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
114 hipLaunchKernelGGL(HIP_KERNEL_NAME(cadd_kernel<real>),
125 const dim3 nthrds(1024, 1, 1);
126 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
128 hipLaunchKernelGGL(HIP_KERNEL_NAME(cfill_kernel<real>),
140 const dim3 nthrds(1024, 1, 1);
141 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
143 hipLaunchKernelGGL(HIP_KERNEL_NAME(add2_kernel<real>),
156 const dim3 nthrds(1024, 1, 1);
157 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
159 hipLaunchKernelGGL(HIP_KERNEL_NAME(add2s1_kernel<real>),
173 const dim3 nthrds(1024, 1, 1);
174 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
176 hipLaunchKernelGGL(HIP_KERNEL_NAME(add2s2_kernel<real>),
190 const dim3 nthrds(1024, 1, 1);
191 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
193 hipLaunchKernelGGL(HIP_KERNEL_NAME(addsqr2s2_kernel<real>),
207 const dim3 nthrds(1024, 1, 1);
208 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
210 hipLaunchKernelGGL(HIP_KERNEL_NAME(add3s2_kernel<real>),
223 const dim3 nthrds(1024, 1, 1);
224 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
226 hipLaunchKernelGGL(HIP_KERNEL_NAME(invcol1_kernel<real>),
238 const dim3 nthrds(1024, 1, 1);
239 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
241 hipLaunchKernelGGL(HIP_KERNEL_NAME(invcol2_kernel<real>),
253 const dim3 nthrds(1024, 1, 1);
254 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
256 hipLaunchKernelGGL(HIP_KERNEL_NAME(col2_kernel<real>),
268 const dim3 nthrds(1024, 1, 1);
269 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
271 hipLaunchKernelGGL(HIP_KERNEL_NAME(col3_kernel<real>),
283 const dim3 nthrds(1024, 1, 1);
284 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
286 hipLaunchKernelGGL(HIP_KERNEL_NAME(subcol3_kernel<real>),
298 const dim3 nthrds(1024, 1, 1);
299 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
301 hipLaunchKernelGGL(HIP_KERNEL_NAME(sub2_kernel<real>),
313 const dim3 nthrds(1024, 1, 1);
314 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
316 hipLaunchKernelGGL(HIP_KERNEL_NAME(sub3_kernel<real>),
328 const dim3 nthrds(1024, 1, 1);
329 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
331 hipLaunchKernelGGL(HIP_KERNEL_NAME(addcol3_kernel<real>),
343 const dim3 nthrds(1024, 1, 1);
344 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
346 hipLaunchKernelGGL(HIP_KERNEL_NAME(addcol4_kernel<real>),
357 void *v1,
void *v2,
void *v3,
int *n) {
359 const dim3 nthrds(1024, 1, 1);
360 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
362 hipLaunchKernelGGL(HIP_KERNEL_NAME(vdot3_kernel<real>),
382 const dim3 nthrds(1024, 1, 1);
383 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
384 const int nb = ((*n) + 1024 - 1)/ 1024;
397 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_kernel<real>),
398 nblcks, nthrds, 0, stream,
402 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
407 hipMemcpyDeviceToHost, stream));
408 hipStreamSynchronize(stream);
420 const dim3 nthrds(1024, 1, 1);
421 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
422 const int nb = ((*n) + 1024 - 1)/ 1024;
435 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_kernel<real>),
436 nblcks, nthrds, 0, stream,
440 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
444 #ifdef HAVE_DEVICE_MPI
445 hipStreamSynchronize(stream);
449 hipMemcpyDeviceToHost, stream));
450 hipStreamSynchronize(stream);
464 const int nt = 1024/pow2;
465 const dim3 nthrds(pow2, nt, 1);
466 const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
467 const dim3 nthrds_red(1024,1,1);
468 const dim3 nblcks_red( (*
j),1,1);
469 const int nb = ((*n) + nt - 1)/nt;
481 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_many_kernel<real>),
482 nblcks, nthrds, 0, stream,
487 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_reduce_kernel<real>),
488 nblcks_red, nthrds_red, 0, stream,
492 #ifdef HAVE_DEVICE_MPI
493 hipStreamSynchronize(stream);
497 hipMemcpyDeviceToHost, stream));
498 hipStreamSynchronize(stream);
510 const dim3 nthrds(1024, 1, 1);
511 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
513 hipLaunchKernelGGL(HIP_KERNEL_NAME(add2s2_many_kernel<real>),
526 const dim3 nthrds(1024, 1, 1);
527 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
528 const int nb = ((*n) + 1024 - 1)/ 1024;
541 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc2_kernel<real>),
542 nblcks, nthrds, 0, stream,
545 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
549 #ifdef HAVE_DEVICE_MPI
550 hipStreamSynchronize(stream);
554 hipMemcpyDeviceToHost, stream));
555 hipStreamSynchronize(stream);
565 const dim3 nthrds(1024, 1, 1);
566 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
567 const int nb = ((*n) + 1024 - 1)/ 1024;
580 hipLaunchKernelGGL(HIP_KERNEL_NAME(glsum_kernel<real>),
581 nblcks, nthrds, 0, stream,
584 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
588 #ifdef HAVE_DEVICE_MPI
589 hipStreamSynchronize(stream);
593 hipMemcpyDeviceToHost, stream));
594 hipStreamSynchronize(stream);
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ u
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void const T *__restrict__ const T *__restrict__ v
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
void hip_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
void hip_cmult2(void *a, void *b, real *c, int *n)
real hip_glsc3(void *a, void *b, void *c, int *n)
void hip_invcol2(void *a, void *b, int *n)
void hip_invcol1(void *a, int *n)
void hip_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
void hip_subcol3(void *a, void *b, void *c, int *n)
void hip_col3(void *a, void *b, void *c, int *n)
real hip_glsc2(void *a, void *b, int *n)
void hip_masked_copy(void *a, void *b, void *mask, int *n, int *m)
void hip_copy(void *a, void *b, int *n)
void hip_add2(void *a, void *b, int *n)
real hip_vlsc3(void *u, void *v, void *w, int *n)
void hip_addsqr2s2(void *a, void *b, real *c1, int *n)
void hip_add2s2(void *a, void *b, real *c1, int *n)
void hip_rzero(void *a, int *n)
void hip_sub2(void *a, void *b, int *n)
void hip_cadd(void *a, real *c, int *n)
real hip_glsum(void *a, int *n)
void hip_addcol3(void *a, void *b, void *c, int *n)
void hip_cfill(void *a, real *c, int *n)
void hip_col2(void *a, void *b, int *n)
void hip_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
void hip_sub3(void *a, void *b, void *c, int *n)
void hip_cmult(void *a, real *c, int *n)
void hip_add2s2_many(void *x, void **p, void *alpha, int *j, int *n)
void hip_addcol4(void *a, void *b, void *c, void *d, int *n)
void hip_add2s1(void *a, void *b, real *c1, int *n)