51 cudaMemcpyDeviceToDevice,
60 const dim3 nthrds(1024, 1, 1);
61 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
63 masked_copy_kernel<real><<<nblcks, nthrds, 0,
83 const dim3 nthrds(1024, 1, 1);
84 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
86 cmult_kernel<real><<<nblcks, nthrds, 0,
97 const dim3 nthrds(1024, 1, 1);
98 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
100 cmult2_kernel<real><<<nblcks, nthrds, 0,
111 const dim3 nthrds(1024, 1, 1);
112 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
114 cadd_kernel<real><<<nblcks, nthrds, 0,
126 const dim3 nthrds(1024, 1, 1);
127 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
129 cfill_kernel<real><<<nblcks, nthrds, 0,
141 const dim3 nthrds(1024, 1, 1);
142 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
144 add2_kernel<real><<<nblcks, nthrds, 0,
157 const dim3 nthrds(1024, 1, 1);
158 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
160 add2s1_kernel<real><<<nblcks, nthrds, 0,
173 const dim3 nthrds(1024, 1, 1);
174 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
176 add2s2_kernel<real><<<nblcks, nthrds, 0,
190 const dim3 nthrds(1024, 1, 1);
191 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
193 add2s2_many_kernel<real><<<nblcks, nthrds, 0,
195 (
real *) alpha, *
j, *n);
207 const dim3 nthrds(1024, 1, 1);
208 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
210 addsqr2s2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)
glb_cmd_queue>>>((
real *) a,
224 const dim3 nthrds(1024, 1, 1);
225 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
227 add3s2_kernel<real><<<nblcks, nthrds, 0,
240 const dim3 nthrds(1024, 1, 1);
241 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
243 invcol1_kernel<real><<<nblcks, nthrds, 0,
254 const dim3 nthrds(1024, 1, 1);
255 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
257 invcol2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)
glb_cmd_queue>>>((
real *) a,
268 const dim3 nthrds(1024, 1, 1);
269 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
271 col2_kernel<real><<<nblcks, nthrds, 0,
282 const dim3 nthrds(1024, 1, 1);
283 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
285 col3_kernel<real><<<nblcks, nthrds, 0,
296 const dim3 nthrds(1024, 1, 1);
297 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
299 subcol3_kernel<real><<<nblcks, nthrds, 0,
311 const dim3 nthrds(1024, 1, 1);
312 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
314 sub2_kernel<real><<<nblcks, nthrds, 0,
325 const dim3 nthrds(1024, 1, 1);
326 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
328 sub3_kernel<real><<<nblcks, nthrds, 0,
340 const dim3 nthrds(1024, 1, 1);
341 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
343 addcol3_kernel<real><<<nblcks, nthrds, 0,
355 const dim3 nthrds(1024, 1, 1);
356 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
358 addcol4_kernel<real><<<nblcks, nthrds, 0,
369 void *v1,
void *v2,
void *v3,
int *n) {
371 const dim3 nthrds(1024, 1, 1);
372 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
374 vdot3_kernel<real><<<nblcks, nthrds, 0,
395 const dim3 nthrds(1024, 1, 1);
396 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
397 const int nb = ((*n) + 1024 - 1)/ 1024;
410 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
413 reduce_kernel<real><<<1, 1024, 0, stream>>> (
bufred_d, nb);
417 cudaMemcpyDeviceToHost, stream));
418 cudaStreamSynchronize(stream);
429 const dim3 nthrds(1024, 1, 1);
430 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
431 const int nb = ((*n) + 1024 - 1)/ 1024;
444 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
447 reduce_kernel<real><<<1, 1024, 0, stream>>> (
bufred_d, nb);
450 #ifdef HAVE_DEVICE_MPI
451 cudaStreamSynchronize(stream);
455 cudaMemcpyDeviceToHost, stream));
456 cudaStreamSynchronize(stream);
471 const int nt = 1024/pow2;
472 const dim3 nthrds(pow2, nt, 1);
473 const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
474 const int nb = ((*n) + nt - 1)/nt;
487 glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
491 glsc3_reduce_kernel<real><<<(*j), 1024, 0, stream>>>(
bufred_d, nb, *
j);
494 #ifdef HAVE_DEVICE_MPI
495 cudaStreamSynchronize(stream);
499 cudaMemcpyDeviceToHost, stream));
500 cudaStreamSynchronize(stream);
510 const dim3 nthrds(1024, 1, 1);
511 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
512 const int nb = ((*n) + 1024 - 1)/ 1024;
526 <<<nblcks, nthrds, 0, stream>>>((
real *) a,
530 reduce_kernel<real><<<1, 1024, 0, stream>>> (
bufred_d, nb);
533 #ifdef HAVE_DEVICE_MPI
534 cudaStreamSynchronize(stream);
538 cudaMemcpyDeviceToHost, stream));
539 cudaStreamSynchronize(stream);
550 const dim3 nthrds(1024, 1, 1);
551 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
552 const int nb = ((*n) + 1024 - 1)/ 1024;
566 <<<nblcks, nthrds, 0, stream>>>((
real *) a,
bufred_d, *n);
568 reduce_kernel<real><<<1, 1024, 0, stream>>> (
bufred_d, nb);
571 #ifdef HAVE_DEVICE_MPI
572 cudaStreamSynchronize(stream);
576 cudaMemcpyDeviceToHost, stream));
577 cudaStreamSynchronize(stream);
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ u
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void const T *__restrict__ const T *__restrict__ v
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
void cuda_invcol1(void *a, int *n)
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n)
real cuda_vlsc3(void *u, void *v, void *w, int *n)
void cuda_add2s2(void *a, void *b, real *c1, int *n)
void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m)
void cuda_col2(void *a, void *b, int *n)
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
void cuda_addcol3(void *a, void *b, void *c, int *n)
void cuda_subcol3(void *a, void *b, void *c, int *n)
void cuda_cmult(void *a, real *c, int *n)
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n)
void cuda_add2s1(void *a, void *b, real *c1, int *n)
real cuda_glsum(void *a, int *n)
real cuda_glsc2(void *a, void *b, int *n)
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
void cuda_rzero(void *a, int *n)
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n)
void cuda_add2(void *a, void *b, int *n)
void cuda_copy(void *a, void *b, int *n)
void cuda_invcol2(void *a, void *b, int *n)
void cuda_col3(void *a, void *b, void *c, int *n)
void cuda_cfill(void *a, real *c, int *n)
void cuda_cadd(void *a, real *c, int *n)
void cuda_sub2(void *a, void *b, int *n)
real cuda_glsc3(void *a, void *b, void *c, int *n)
void cuda_cmult2(void *a, void *b, real *c, int *n)
void cuda_sub3(void *a, void *b, void *c, int *n)