54 void *xbar,
int *
j,
int *n){
61 const int nt = 1024/pow2;
62 const dim3 glsc3_nthrds(pow2, nt, 1);
63 const dim3 glsc3_nblcks(((*n)+nt - 1)/nt, 1, 1);
64 const int glsc3_nb = ((*n) + nt - 1)/nt;
74 glsc3_many_kernel<real>
75 <<<glsc3_nblcks, glsc3_nthrds, 0, stream>>>((
const real *) b,
80 glsc3_reduce_kernel<real>
84 cudaMemcpyDeviceToDevice, stream));
87 cudaStreamSynchronize(stream);
90 const dim3 vec_nthrds(1024, 1, 1);
91 const dim3 vec_nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
94 project_on_vec_kernel<real>
95 <<<vec_nblcks, vec_nthrds, 0, stream>>>((
real *) xbar,
102 glsc3_many_kernel<real>
103 <<<glsc3_nblcks, glsc3_nthrds, 0, stream>>>((
const real *) b,
108 glsc3_reduce_kernel<real>
112 cudaMemcpyDeviceToDevice, stream));
114 cudaStreamSynchronize(stream);
118 project_on_vec_kernel<real>
119 <<<vec_nblcks, vec_nthrds, 0, stream>>>((
real *) xbar,
123 (
const real *) alpha,
128 void *
w,
void *xm,
int *
j,
int *n,
real *nrm){
136 const int nt = 1024/pow2;
137 const dim3 glsc3_nthrds(pow2, nt, 1);
138 const dim3 glsc3_nblcks(((*n)+nt - 1)/nt, 1, 1);
139 const int glsc3_nb = ((*n) + nt - 1)/nt;
149 glsc3_many_kernel<real>
150 <<<glsc3_nblcks, glsc3_nthrds, 0, stream>>>((
const real *) b,
155 glsc3_reduce_kernel<real>
159 cudaMemcpyDeviceToDevice, stream));
161 cudaStreamSynchronize(stream);
165 sizeof(
real), cudaMemcpyDeviceToHost, stream));
169 const dim3 vec_nthrds(1024, 1, 1);
170 const dim3 vec_nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
173 project_ortho_vec_kernel<real>
174 <<<vec_nblcks, vec_nthrds, 0, stream>>>((
real *) xm, (
const real **) xx,
176 (
const real *) alpha, *
j, *n);
179 glsc3_many_kernel<real>
180 <<<glsc3_nblcks, glsc3_nthrds, 0, stream>>>((
const real *) b,
185 glsc3_reduce_kernel<real>
189 cudaMemcpyDeviceToDevice, stream));
191 cudaStreamSynchronize(stream);
195 project_ortho_vec_kernel<real>
196 <<<vec_nblcks, vec_nthrds, 0, stream>>>((
real *) xm, (
const real **) xx,
198 (
const real *) alpha, *
j, *n);
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
void device_mpi_allreduce_inplace(void *buf_d, int count, int nbytes, int op)
void cuda_project_ortho(void *alpha, void *b, void *xx, void *bb, void *w, void *xm, int *j, int *n, real *nrm)
void cuda_project_on(void *alpha, void *b, void *xx, void *bb, void *mult, void *xbar, int *j, int *n)