52 void *
u,
int *n,
void *gd,
int *nb,
53 void *b,
void *bo,
int *op,
54 cudaStream_t stream) {
56 if ((*m) == 0)
return;
58 const dim3 nthrds(1024, 1, 1);
59 const dim3 nblcks(((*m)+ 1024 - 1)/ 1024, 1, 1);
63 gather_kernel_add<real>
64 <<<nblcks, nthrds, 0, stream>>>((
real *)
v, *m, *o, (
int *) dg,
65 (
real *)
u, *n, (
int *) gd,
66 *nb, (
int *) b, (
int *) bo);
70 gather_kernel_mul<real>
71 <<<nblcks, nthrds, 0, stream>>>((
real *)
v, *m, *o, (
int *) dg,
72 (
real *)
u, *n, (
int *) gd,
73 *nb, (
int *) b, (
int *) bo);
77 gather_kernel_min<real>
78 <<<nblcks, nthrds, 0, stream>>>((
real *)
v, *m, *o, (
int *) dg,
79 (
real *)
u, *n, (
int *) gd,
80 *nb, (
int *) b, (
int *) bo);
84 gather_kernel_max<real>
85 <<<nblcks, nthrds, 0, stream>>>((
real *)
v, *m, *o, (
int *) dg,
86 (
real *)
u, *n, (
int *) gd,
87 *nb, (
int *) b, (
int *) bo);
97 void *
u,
int *n,
void *gd,
98 int *nb,
void *b,
void *bo,
99 cudaStream_t stream) {
// Early exit: no kernel is launched when *m is zero.
101 if ((*m) == 0)
return;
// 1-D launch configuration: 1024 threads per block and
// ceil(*m / 1024) blocks (integer ceiling division).
103 const dim3 nthrds(1024, 1, 1);
104 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
// Launch on `stream` with 0 bytes of dynamic shared memory. The untyped
// pointer arguments are cast to real* / int*, and the sizes received by
// pointer (m, n, nb) are dereferenced on the host and passed by value.
107 <<<nblcks, nthrds, 0, stream>>>((
real *)
v, *m, (
int *) dg,
108 (
real *)
u, *n, (
int *) gd,
109 *nb, (
int *) b, (
int *) bo);
117 int offset,
int n, cudaStream_t stream) {
// Launch configuration: 1024 threads per block, ceil(n / 1024) blocks.
// NOTE(review): n == 0 gives nblcks == 0, which is not a valid grid size;
// no guard is visible here -- confirm callers never pass an empty range.
119 const int nthrds = 1024;
120 const int nblcks = (n + nthrds - 1) / nthrds;
// Two launch forms follow: one without a stream argument under the
// `stream == NULL` test, and one that passes `stream` explicitly.
122 if (stream == NULL) {
// The casts bind before the addition, so buf_d is advanced by `offset`
// real elements and dof_d by `offset` int elements; u_d is passed from
// its base address.
124 <<<nblcks, nthrds>>>((
real *) u_d, (
real *) buf_d + offset,
125 (
int *) dof_d + offset, n);
// Same arguments as the launch above, issued on `stream` with 0 bytes of
// dynamic shared memory.
129 <<<nblcks, nthrds, 0, stream>>>((
real *) u_d, (
real *) buf_d + offset,
130 (
int *) dof_d + offset, n);
140 int offset,
int n, cudaStream_t stream) {
// Launch configuration: 1024 threads per block, ceil(n / 1024) blocks.
// NOTE(review): n == 0 gives nblcks == 0, which is not a valid grid size;
// no guard is visible here -- confirm callers never pass an empty range.
142 const int nthrds = 1024;
143 const int nblcks = (n + nthrds - 1) / nthrds;
// With a NULL stream handle, gs_unpack_add_kernel<real> is launched
// without an explicit stream argument. buf_d and dof_d are both advanced
// by `offset` elements; u_d is passed from its base address.
147 if (stream == NULL) {
148 gs_unpack_add_kernel<real>
149 <<<nblcks, nthrds>>>(u_d, buf_d + offset, dof_d + offset, n);
// Same kernel, issued on `stream` with 0 bytes of dynamic shared memory.
152 gs_unpack_add_kernel<real>
153 <<<nblcks, nthrds, 0, stream>>>(u_d, buf_d + offset,
// Reports an unrecognised `op` value together with this source file's
// name on standard output.
158 printf(
"%s: unknown gs op %d\n", __FILE__, op);
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
void cuda_gs_unpack(real *u_d, int op, real *buf_d, int *dof_d, int offset, int n, cudaStream_t stream)
void cuda_gs_pack(void *u_d, void *buf_d, void *dof_d, int offset, int n, cudaStream_t stream)
void cuda_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, cudaStream_t stream)
void cuda_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, cudaStream_t stream)