65 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
66 const int nb = ((*n) + 1024 - 1)/ 1024;
104 if (
sizeof(
real) ==
sizeof(
float)) {
109 else if (
sizeof(
real) ==
sizeof(
double)) {
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
void device_nccl_allreduce(void *sbuf_d, void *rbuf_d, int count, int nbytes, int op, void *stream)
real cuda_gmres_part2(void *w, void *v, void *h, void *mult, int *j, int *n)