/**
 * Fortran wrapper for the device update of the CG direction p
 */
void cuda_fusedcg_update_p(void *p, void *z, void *po, real *beta, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  fusedcg_update_p_kernel<real>
    <<<nblcks, nthrds, 0, stream>>>((real *) p, (real *) z,
                                    (real *) po, *beta, *n);
}
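/*
 * A minimal sketch of a kernel matching the launch signature above, assuming
 * the standard CG direction update p = z + beta * po over a grid-stride loop.
 * The _sketch name is ours for illustration; it is not the kernel shipped
 * with the solver.
 */
template< typename T >
__global__ void fusedcg_update_p_kernel_sketch(T * __restrict__ p,
                                               const T * __restrict__ z,
                                               const T * __restrict__ po,
                                               const T beta,
                                               const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    p[i] = z[i] + beta * po[i];
  }
}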
/**
 * Fortran wrapper for the device update of the solution x
 */
void cuda_fusedcg_update_x(void *x, void *p, void *alpha, int *p_cur, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  fusedcg_update_x_kernel<real>
    <<<nblcks, nthrds, 0, stream>>>((real *) x, (const real **) p,
                                    (const real *) alpha, *p_cur, *n);
}
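/*
 * Illustration of the x-update such a launch presumably performs (again a
 * sketch, not the solver's actual kernel): with p a table of p_cur direction
 * vectors and alpha the matching step lengths, each thread accumulates
 * x[i] += sum_j alpha[j] * p[j][i].
 */
template< typename T >
__global__ void fusedcg_update_x_kernel_sketch(T * __restrict__ x,
                                               const T ** p,
                                               const T * __restrict__ alpha,
                                               const int p_cur,
                                               const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j++) {
      tmp += alpha[j] * p[j][i];
    }
    x[i] += tmp;
  }
}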
/**
 * Fortran wrapper for the fused CG "part 2" device step
 */
real cuda_fusedcg_part2(void *a, void *b, void *c, void *alpha_d,
                        real *alpha, int *p_cur, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;

  /* ... (allocation of the pinned host buffer fusedcg_buf and the
     nb-entry device buffer fusedcg_buf_d elided in this excerpt) ... */

  /* Stage alpha(p_cur) in pinned host memory, then copy it into
     alpha_d(p_cur); p_cur is a 1-based Fortran index, hence the -1 */
  fusedcg_buf[1] = (*alpha);
  real *alpha_d_p_cur = ((real *) alpha_d) + (*p_cur - 1);
  CUDA_CHECK(cudaMemcpyAsync(alpha_d_p_cur, &fusedcg_buf[1], sizeof(real),
                             cudaMemcpyHostToDevice, stream));

  /* One partial sum per block is written to fusedcg_buf_d ... */
  fusedcg_part2_kernel<real>
    <<<nblcks, nthrds, 0, stream>>>((real *) a, (real *) b, (real *) c,
                                    *alpha, fusedcg_buf_d, *n);

  /* ... and a single block folds the nb partials into fusedcg_buf_d[0] */
  reduce_kernel<real><<<1, 1024, 0, stream>>>(fusedcg_buf_d, nb);

#ifdef HAVE_DEVICE_MPI
  cudaStreamSynchronize(stream);
  device_mpi_allreduce(fusedcg_buf_d, fusedcg_buf, 1,
                       sizeof(real), DEVICE_MPI_SUM);
#else
  CUDA_CHECK(cudaMemcpyAsync(fusedcg_buf, fusedcg_buf_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#endif

  return fusedcg_buf[0];
}
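/*
 * The <<<1, 1024>>> reduce_kernel launch above implies a two-stage reduction:
 * fusedcg_part2_kernel leaves one partial sum per block in fusedcg_buf_d, and
 * a single block then sums those nb partials into element 0.  A minimal
 * sketch of such a final-stage kernel (assumes blockDim.x is a power of two,
 * here 1024; not the library's implementation):
 */
template< typename T >
__global__ void reduce_kernel_sketch(T * bufred, const int n) {
  __shared__ T shared[1024];
  const int tid = threadIdx.x;

  /* Each thread accumulates a strided slice of the partial results */
  T sum = 0;
  for (int i = tid; i < n; i += blockDim.x) {
    sum += bufred[i];
  }
  shared[tid] = sum;
  __syncthreads();

  /* Tree reduction in shared memory */
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) {
      shared[tid] += shared[tid + s];
    }
    __syncthreads();
  }

  if (tid == 0) {
    bufred[0] = shared[0];
  }
}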
/*
 * Declarations referenced in this excerpt (identifiers such as real, stream,
 * fusedcg_buf, fusedcg_buf_d and CUDA_CHECK are defined elsewhere in the code
 * base and are not shown here):
 *
 *   void device_mpi_allreduce(void *buf_d, void *buf, int count,
 *                             int nbytes, int op);
 *   real cuda_fusedcg_part2(void *a, void *b, void *c, void *alpha_d,
 *                           real *alpha, int *p_cur, int *n);
 *   void cuda_fusedcg_update_x(void *x, void *p, void *alpha,
 *                              int *p_cur, int *n);
 *   void cuda_fusedcg_update_p(void *p, void *z, void *po, real *beta, int *n);
 */
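/*
 * Usage sketch (hypothetical caller, not part of the original file): the
 * wrappers follow the Fortran calling convention, so scalars are passed by
 * address.  The device pointers p_d, z_d and po_d are assumed to have been
 * allocated with cudaMalloc elsewhere.
 */
static void example_update_p(void *p_d, void *z_d, void *po_d, int n) {
  real beta = (real) 0.5;   /* hypothetical value of the CG beta coefficient */
  cuda_fusedcg_update_p(p_d, z_d, po_d, &beta, &n);
}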