51 cudaMemcpyDeviceToDevice,
60 const dim3 nthrds(1024, 1, 1);
61 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
63 masked_copy_kernel<real><<<nblcks, nthrds, 0,
74 const dim3 nthrds(1024, 1, 1);
75 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
77 cfill_mask_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)
glb_cmd_queue>>>(
78 (
real*)a, *c, *size, mask, *mask_size);
95 const dim3 nthrds(1024, 1, 1);
96 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
98 cmult_kernel<real><<<nblcks, nthrds, 0,
109 const dim3 nthrds(1024, 1, 1);
110 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
112 cmult2_kernel<real><<<nblcks, nthrds, 0,
123 const dim3 nthrds(1024, 1, 1);
124 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
126 cadd_kernel<real><<<nblcks, nthrds, 0,
138 const dim3 nthrds(1024, 1, 1);
139 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
141 cadd2_kernel<real><<<nblcks, nthrds, 0,
152 const dim3 nthrds(1024, 1, 1);
153 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
155 cfill_kernel<real><<<nblcks, nthrds, 0,
167 const dim3 nthrds(1024, 1, 1);
168 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
170 add2_kernel<real><<<nblcks, nthrds, 0,
182 const dim3 nthrds(1024, 1, 1);
183 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
185 add3_kernel<real><<<nblcks, nthrds, 0,
198 const dim3 nthrds(1024, 1, 1);
199 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
201 add2s1_kernel<real><<<nblcks, nthrds, 0,
214 const dim3 nthrds(1024, 1, 1);
215 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
217 add2s2_kernel<real><<<nblcks, nthrds, 0,
231 const dim3 nthrds(1024, 1, 1);
232 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
234 add2s2_many_kernel<real><<<nblcks, nthrds, 0,
236 (
real *) alpha, *
j, *n);
248 const dim3 nthrds(1024, 1, 1);
249 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
251 addsqr2s2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)
glb_cmd_queue>>>((
real *) a,
265 const dim3 nthrds(1024, 1, 1);
266 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
268 add3s2_kernel<real><<<nblcks, nthrds, 0,
281 const dim3 nthrds(1024, 1, 1);
282 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
284 invcol1_kernel<real><<<nblcks, nthrds, 0,
295 const dim3 nthrds(1024, 1, 1);
296 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
298 invcol2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)
glb_cmd_queue>>>((
real *) a,
309 const dim3 nthrds(1024, 1, 1);
310 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
312 col2_kernel<real><<<nblcks, nthrds, 0,
323 const dim3 nthrds(1024, 1, 1);
324 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
326 col3_kernel<real><<<nblcks, nthrds, 0,
337 const dim3 nthrds(1024, 1, 1);
338 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
340 subcol3_kernel<real><<<nblcks, nthrds, 0,
352 const dim3 nthrds(1024, 1, 1);
353 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
355 sub2_kernel<real><<<nblcks, nthrds, 0,
366 const dim3 nthrds(1024, 1, 1);
367 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
369 sub3_kernel<real><<<nblcks, nthrds, 0,
381 const dim3 nthrds(1024, 1, 1);
382 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
384 addcol3_kernel<real><<<nblcks, nthrds, 0,
396 const dim3 nthrds(1024, 1, 1);
397 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
399 addcol4_kernel<real><<<nblcks, nthrds, 0,
410 void *v1,
void *v2,
void *v3,
int *n) {
412 const dim3 nthrds(1024, 1, 1);
413 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
415 vdot3_kernel<real><<<nblcks, nthrds, 0,
436 const dim3 nthrds(1024, 1, 1);
437 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
438 const int nb = ((*n) + 1024 - 1)/ 1024;
451 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
454 reduce_kernel<real><<<1, 1024, 0, stream>>> (
bufred_d, nb);
458 cudaMemcpyDeviceToHost, stream));
459 cudaStreamSynchronize(stream);
470 const dim3 nthrds(1024, 1, 1);
471 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
472 const int nb = ((*n) + 1024 - 1)/ 1024;
485 glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
488 reduce_kernel<real><<<1, 1024, 0, stream>>> (
bufred_d, nb);
491 #ifdef HAVE_DEVICE_MPI
492 cudaStreamSynchronize(stream);
496 cudaMemcpyDeviceToHost, stream));
497 cudaStreamSynchronize(stream);
512 const int nt = 1024/pow2;
513 const dim3 nthrds(pow2, nt, 1);
514 const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
515 const int nb = ((*n) + nt - 1)/nt;
528 glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
532 glsc3_reduce_kernel<real><<<(*j), 1024, 0, stream>>>(
bufred_d, nb, *
j);
535 #ifdef HAVE_DEVICE_MPI
536 cudaStreamSynchronize(stream);
540 cudaMemcpyDeviceToHost, stream));
541 cudaStreamSynchronize(stream);
551 const dim3 nthrds(1024, 1, 1);
552 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
553 const int nb = ((*n) + 1024 - 1)/ 1024;
567 <<<nblcks, nthrds, 0, stream>>>((
real *) a,
571 reduce_kernel<real><<<1, 1024, 0, stream>>> (
bufred_d, nb);
574 #ifdef HAVE_DEVICE_MPI
575 cudaStreamSynchronize(stream);
579 cudaMemcpyDeviceToHost, stream));
580 cudaStreamSynchronize(stream);
591 const dim3 nthrds(1024, 1, 1);
592 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
593 const int nb = ((*n) + 1024 - 1)/ 1024;
607 <<<nblcks, nthrds, 0, stream>>>((
real *) a,
bufred_d, *n);
609 reduce_kernel<real><<<1, 1024, 0, stream>>> (
bufred_d, nb);
612 #ifdef HAVE_DEVICE_MPI
613 cudaStreamSynchronize(stream);
617 cudaMemcpyDeviceToHost, stream));
618 cudaStreamSynchronize(stream);
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ u
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void const T *__restrict__ const T *__restrict__ v
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
void cuda_invcol1(void *a, int *n)
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n)
void cuda_cadd2(void *a, void *b, real *c, int *n)
real cuda_vlsc3(void *u, void *v, void *w, int *n)
void cuda_add2s2(void *a, void *b, real *c1, int *n)
void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m)
void cuda_add3(void *a, void *b, void *c, int *n)
void cuda_col2(void *a, void *b, int *n)
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
void cuda_addcol3(void *a, void *b, void *c, int *n)
void cuda_subcol3(void *a, void *b, void *c, int *n)
void cuda_cmult(void *a, real *c, int *n)
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n)
void cuda_add2s1(void *a, void *b, real *c1, int *n)
real cuda_glsum(void *a, int *n)
real cuda_glsc2(void *a, void *b, int *n)
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
void cuda_rzero(void *a, int *n)
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n)
void cuda_add2(void *a, void *b, int *n)
void cuda_copy(void *a, void *b, int *n)
void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size)
void cuda_invcol2(void *a, void *b, int *n)
void cuda_col3(void *a, void *b, void *c, int *n)
void cuda_cfill(void *a, real *c, int *n)
void cuda_cadd(void *a, real *c, int *n)
void cuda_sub2(void *a, void *b, int *n)
real cuda_glsc3(void *a, void *b, void *c, int *n)
void cuda_cmult2(void *a, void *b, real *c, int *n)
void cuda_sub3(void *a, void *b, void *c, int *n)