54 const dim3 nthrds(1024, 1, 1);
55 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
56 const int nb = ((*n) + 1024 - 1)/ 1024;
71 gmres_part2_kernel<real>
72 <<<nblcks, nthrds, 0, stream>>>((
real *)
w, (
real **)
v,
76 reduce_kernel<real><<<1, 1024, 0, stream>>>(
gmres_bfd1, nb);
79 #ifdef HAVE_DEVICE_MPI
80 cudaStreamSynchronize(stream);
85 cudaMemcpyDeviceToHost, stream));
86 cudaStreamSynchronize(stream);
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
real cuda_gmres_part2(void *w, void *v, void *h, void *mult, int *j, int *n)