35 #include <hip/hip_runtime.h>
52 int *p_cur,
int *p_space,
int *n) {
54 const dim3 nthrds(1024, 1, 1);
55 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
57 hipLaunchKernelGGL(HIP_KERNEL_NAME(cg_update_xp_kernel<real>),
60 (
real *) beta, *p_cur, *p_space, *n);
65 void *u2,
void *
w,
void *z,
66 void *ni,
void *mi,
real *alpha,
67 real *beta,
void *mult,
68 real *reduction,
int *n) {
70 const dim3 nthrds(1024, 1, 1);
71 const dim3 nblcks(((*n) + 1024 - 1)/ 1024, 1, 1);
72 const int nb = ((*n) + 1024 - 1)/ 1024;
91 hipLaunchKernelGGL(HIP_KERNEL_NAME(pipecg_vecops_kernel<real>),
92 nblcks, nthrds, 0, stream,
95 (
real *) ni, (
real *) mi, *alpha, *beta,
99 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
100 1, 1024, 0 , stream,
buf_d1, nb);
102 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
103 1, 1024, 0 , stream,
buf_d2, nb);
105 hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
106 1, 1024, 0 , stream,
buf_d3, nb);
110 hipMemcpyDeviceToHost, stream));
113 hipMemcpyDeviceToHost, stream));
116 hipMemcpyDeviceToHost, stream));
121 reduction[0] =
buf[0];
122 reduction[1] =
buf[1];
123 reduction[2] =
buf[2];
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void const T *__restrict__ x
void hip_cg_update_xp(void *x, void *p, void *u, void *alpha, void *beta, int *p_cur, int *p_space, int *n)
void hip_pipecg_vecops(void *p, void *q, void *r, void *s, void *u1, void *u2, void *w, void *z, void *ni, void *mi, real *alpha, real *beta, void *mult, real *reduction, int *n)