51 int *p_cur,
int *p_space,
int *n) {
53 const dim3 nthrds(1024, 1, 1);
54 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
57 cg_update_xp_kernel<real>
58 <<<nblcks, nthrds, 0, stream>>>((
real *)
x, (
real *) p,
60 (
real *) beta, *p_cur, *p_space, *n);
66 void *u2,
void *
w,
void *z,
67 void *ni,
void *mi,
real *alpha,
68 real *beta,
void *mult,
69 real *reduction,
int *n) {
71 const dim3 nthrds(1024, 1, 1);
72 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
73 const int nb = ((*n) + 1024 - 1)/ 1024;
92 pipecg_vecops_kernel<real>
93 <<<nblcks, nthrds, 0, stream>>>((
real *) p, (
real *) q,
98 *alpha, *beta, (
real *)mult,
103 reduce_kernel<real><<<1, 1024, 0, stream>>>(
buf_d1, nb);
105 reduce_kernel<real><<<1, 1024, 0, stream>>>(
buf_d2, nb);
107 reduce_kernel<real><<<1, 1024, 0, stream>>>(
buf_d3, nb);
111 cudaMemcpyDeviceToHost, stream));
114 cudaMemcpyDeviceToHost, stream));
117 cudaMemcpyDeviceToHost, stream));
122 reduction[0] =
buf[0];
123 reduction[1] =
buf[1];
124 reduction[2] =
buf[2];
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void const T *__restrict__ x
void cuda_pipecg_vecops(void *p, void *q, void *r, void *s, void *u1, void *u2, void *w, void *z, void *ni, void *mi, real *alpha, real *beta, void *mult, real *reduction, int *n)
void cuda_cg_update_xp(void *x, void *p, void *u, void *alpha, void *beta, int *p_cur, int *p_space, int *n)