35 #include <hip/hip_runtime.h>
56 const dim3 nthrds(1024, 1, 1);
57 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
60 hipLaunchKernelGGL(HIP_KERNEL_NAME(fusedcg_update_p_kernel<real>),
61 nblcks, nthrds, 0, stream,
69 const dim3 nthrds(1024, 1, 1);
70 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
73 hipLaunchKernelGGL(HIP_KERNEL_NAME(fusedcg_update_x_kernel<real>),
74 nblcks, nthrds, 0, stream,
76 (
const real *) alpha, *p_cur, *n);
82 void *alpha_d ,
real *alpha,
int *p_cur,
int * n) {
84 const dim3 nthrds(1024, 1, 1);
85 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
86 const int nb = ((*n) + 1024 - 1)/ 1024;
105 real *alpha_d_p_cur = ((
real *) alpha_d) + ((*p_cur - 1));
107 hipMemcpyHostToDevice, stream));
109 hipLaunchKernelGGL(HIP_KERNEL_NAME(fusedcg_part2_kernel<real>),
110 nblcks, nthrds, 0, stream,
115 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
119 #ifdef HAVE_DEVICE_MPI
125 hipMemcpyDeviceToHost, stream));
__global__ void const T *__restrict__ x
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
void hip_fusedcg_update_x(void *x, void *p, void *alpha, int *p_cur, int *n)
real hip_fusedcg_part2(void *a, void *b, void *c, void *alpha_d, real *alpha, int *p_cur, int *n)
void hip_fusedcg_update_p(void *p, void *z, void *po, real *beta, int *n)