54 void *xbar,
int *
j,
int *n){
61 const int nt = 1024/
pow2;
64 const int glsc3_nb = ((*n) + nt - 1)/nt;
123 (
const real *) alpha,
136 const int nt = 1024/
pow2;
139 const int glsc3_nb = ((*n) + nt - 1)/nt;
176 (
const real *) alpha, *
j, *n);
198 (
const real *) alpha, *
j, *n);
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
void device_mpi_allreduce_inplace(void *buf_d, int count, int nbytes, int op)
void cuda_project_ortho(void *alpha, void *b, void *xx, void *bb, void *w, void *xm, int *j, int *n, real *nrm)
void cuda_project_on(void *alpha, void *b, void *xx, void *bb, void *mult, void *xbar, int *j, int *n)