void cuda_copy(void *a, void *b, int *n) {
  CUDA_CHECK(cudaMemcpyAsync(a, b, (*n) * sizeof(real),
                             cudaMemcpyDeviceToDevice,
                             (cudaStream_t) glb_cmd_queue));
}

void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);
  masked_copy_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
    ((real *) a, (real *) b, (int *) mask, *n, *m);
}

void cuda_masked_red_copy(void *a, void *b, void *mask, int *n, int *m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);
  masked_red_copy_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
    ((real *) a, (real *) b, (int *) mask, *n, *m);
}

void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
  cfill_mask_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
    ((real *) a, *c, *size, mask, *mask_size);
}
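/*
 * Every wrapper below uses the same launch configuration: 1024 threads
 * per block and ceil(n/1024) blocks, via the integer identity
 * ceil(n/b) = (n + b - 1) / b (e.g. n = 3000 gives (3000 + 1023)/1024 = 3
 * blocks, covering all elements). A minimal sketch of that computation
 * as a helper; the name launch_blocks is hypothetical and not part of
 * this file:
 */
static inline dim3 launch_blocks(int n, int block_size) {
  /* one block per full or partial tile of block_size elements */
  return dim3((n + block_size - 1) / block_size, 1, 1);
}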
void cuda_cmult(void *a, real *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  cmult_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
}

void cuda_cmult2(void *a, void *b, real *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  cmult2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c, *n);
}

void cuda_cadd(void *a, real *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  cadd_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
}

void cuda_cadd2(void *a, void *b, real *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  cadd2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c, *n);
}
void cuda_cfill(void *a, real *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  cfill_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
}

void cuda_add2(void *a, void *b, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  add2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
}

void cuda_add3(void *a, void *b, void *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  add3_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
}
void cuda_add4(void *a, void *b, void *c, void *d, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  add4_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
    ((real *) a, (real *) b, (real *) c, (real *) d, *n);
}
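/*
 * The *_kernel definitions are not part of this listing. As a sketch of
 * what a launch like the one above pairs with, here is a plausible
 * grid-stride formulation of add4 (an assumption, not the actual kernel
 * source): each thread strides over the array so any grid size covers
 * all n elements.
 */
template< typename T >
__global__ void add4_kernel_sketch(T * __restrict__ a,
                                   const T * __restrict__ b,
                                   const T * __restrict__ c,
                                   const T * __restrict__ d,
                                   const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str)
    a[i] = b[i] + c[i] + d[i];  /* componentwise sum of three vectors */
}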
void cuda_add2s1(void *a, void *b, real *c1, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  add2s1_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
}

void cuda_add2s2(void *a, void *b, real *c1, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  add2s2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
}

void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  add2s2_many_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
    ((real *) x, (const real **) p, (real *) alpha, *j, *n);
}
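/*
 * add2s2_many fuses j scaled updates x = x + alpha(k) * p(k) into one
 * launch instead of j separate add2s2 calls, reading and writing x only
 * once per element. A sketch of the fused update under that assumed
 * semantics (Nek5000-style naming; not the actual kernel source):
 */
template< typename T >
__global__ void add2s2_many_sketch(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int j, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    T tmp = 0;
    for (int k = 0; k < j; k++)
      tmp += alpha[k] * p[k][i];  /* accumulate all j contributions */
    x[i] += tmp;
  }
}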
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  addsqr2s2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
    ((real *) a, (real *) b, *c1, *n);
}

void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  add3s2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
    ((real *) a, (real *) b, (real *) c, *c1, *c2, *n);
}
void cuda_invcol1(void *a, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  invcol1_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, *n);
}

void cuda_invcol2(void *a, void *b, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  invcol2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
}
void cuda_col2(void *a, void *b, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  col2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
}

void cuda_col3(void *a, void *b, void *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  col3_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
}

void cuda_subcol3(void *a, void *b, void *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  subcol3_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
}
void cuda_sub2(void *a, void *b, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  sub2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
}

void cuda_sub3(void *a, void *b, void *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  sub3_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
}

void cuda_addcol3(void *a, void *b, void *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  addcol3_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
}

void cuda_addcol4(void *a, void *b, void *c, void *d, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  addcol4_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
    ((real *) a, (real *) b, (real *) c, (real *) d, *n);
}
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3,
                void *v1, void *v2, void *v3, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  vdot3_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
    ((real *) dot, (real *) u1, (real *) u2, (real *) u3,
     (real *) v1, (real *) v2, (real *) v3, *n);
}
void cuda_vcross(void *u1, void *u2, void *u3,
                 void *v1, void *v2, void *v3,
                 void *w1, void *w2, void *w3, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  vcross_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
    ((real *) u1, (real *) u2, (real *) u3,
     (real *) v1, (real *) v2, (real *) v3,
     (real *) w1, (real *) w2, (real *) w3, *n);
}
real cuda_vlsc3(void *u, void *v, void *w, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  /* bufred / bufred_d: host/device buffers for the nb per-block
     partial sums, managed elsewhere in this file */
  glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *) u, (real *) v, (real *) w, bufred_d, *n);
  reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return bufred[0];
}
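/*
 * The reductions here are two-stage: a first kernel leaves one partial
 * sum per block in bufred_d (nb entries), then reduce_kernel collapses
 * those partials with a single 1024-thread block before the scalar is
 * copied back to the host. A plausible shape for that second stage,
 * assuming a power-of-two block size (a sketch, not the actual kernel
 * source):
 */
template< typename T >
__global__ void reduce_sketch(T * bufred, const int n) {
  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  T tmp = 0;
  for (int i = idx; i < n; i += blockDim.x)
    tmp += bufred[i];               /* fold n partials into 1024 lanes */
  buf[idx] = tmp;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {  /* tree reduction */
    if (idx < s)
      buf[idx] += buf[idx + s];
    __syncthreads();
  }
  if (idx == 0)
    bufred[0] = buf[0];             /* final sum in bufred[0] */
}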
real cuda_glsc3(void *a, void *b, void *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *) a, (real *) b, (real *) c, bufred_d, *n);
  reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
#ifdef HAVE_DEVICE_MPI
  cudaStreamSynchronize(stream);
  /* sum the partial result across ranks; DEVICE_MPI_SUM is assumed to
     be the sum op constant of the device MPI layer */
  device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#endif
  return bufred[0];
}
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n) {
  int pow2 = 1;
  while (pow2 < (*j))
    pow2 = 2 * pow2;                /* smallest power of two >= j */
  const int nt = 1024 / pow2;
  const dim3 nthrds(pow2, nt, 1);
  const dim3 nblcks(((*n) + nt - 1) / nt, 1, 1);
  const int nb = ((*n) + nt - 1) / nt;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((const real *) w, (const real **) v, (const real *) mult, bufred_d, *j, *n);
  glsc3_reduce_kernel<real><<<(*j), 1024, 0, stream>>>(bufred_d, nb, *j);
#ifdef HAVE_DEVICE_MPI
  cudaStreamSynchronize(stream);
  device_mpi_allreduce(bufred_d, h, (*j), sizeof(real), DEVICE_MPI_SUM);
#else
  CUDA_CHECK(cudaMemcpyAsync(h, bufred_d, (*j) * sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#endif
}
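/*
 * glsc3_many reduces j inner products in one pass using a 2D block:
 * threadIdx.x (pow2 lanes) selects which product a thread feeds, while
 * threadIdx.y (nt = 1024/pow2 lanes) strides the vectors, keeping the
 * block at 1024 threads. A sketch of that indexing, with an assumed
 * partial-sum layout and dynamic shared memory of
 * blockDim.x * blockDim.y * sizeof(T) bytes (not the actual source):
 */
template< typename T >
__global__ void glsc3_many_sketch(const T * w, const T ** v,
                                  const T * mult, T * partial,
                                  const int j, const int n) {
  extern __shared__ unsigned char smem[];
  T * buf = (T *) smem;             /* blockDim.x * blockDim.y entries */
  const int k = threadIdx.x;        /* which of the j inner products   */
  const int row = threadIdx.y;
  T tmp = 0;
  if (k < j) {
    for (int i = blockIdx.x * blockDim.y + row; i < n;
         i += blockDim.y * gridDim.x)
      tmp += w[i] * v[k][i] * mult[i];
  }
  buf[row * blockDim.x + k] = tmp;
  __syncthreads();
  for (int s = blockDim.y / 2; s > 0; s >>= 1) {  /* reduce over rows */
    if (row < s)
      buf[row * blockDim.x + k] += buf[(row + s) * blockDim.x + k];
    __syncthreads();
  }
  if (row == 0 && k < j)
    partial[blockIdx.x * j + k] = buf[k];  /* j partials per block */
}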
real cuda_glsc2(void *a, void *b, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  /* kernel name lost in extraction; glsc2_kernel inferred from the
     wrapper and the file's naming pattern */
  glsc2_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *) a, (real *) b, bufred_d, *n);
  reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
#ifdef HAVE_DEVICE_MPI
  cudaStreamSynchronize(stream);
  device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#endif
  return bufred[0];
}
real cuda_glsum(void *a, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  /* kernel name lost in extraction; glsum_kernel inferred from the
     wrapper and the file's naming pattern */
  glsum_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *) a, bufred_d, *n);
  reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
#ifdef HAVE_DEVICE_MPI
  cudaStreamSynchronize(stream);
  device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#endif
  return bufred[0];
}
void cuda_absval(void *a, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  /* kernel name lost in extraction; absval_kernel inferred */
  absval_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *) a, *n);
}
void cuda_pwmax_vec2(void *a, void *b, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  pwmax_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *) a, (real *) b, *n);
}

void cuda_pwmax_vec3(void *a, void *b, void *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  pwmax_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *) a, (real *) b, (real *) c, *n);
}

void cuda_pwmax_sca2(void *a, real *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  pwmax_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *) a, *c, *n);
}

void cuda_pwmax_sca3(void *a, void *b, real *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  pwmax_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *) a, (real *) b, *c, *n);
}

void cuda_pwmin_vec2(void *a, void *b, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  pwmin_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *) a, (real *) b, *n);
}

void cuda_pwmin_vec3(void *a, void *b, void *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  pwmin_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *) a, (real *) b, (real *) c, *n);
}

void cuda_pwmin_sca2(void *a, real *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  pwmin_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *) a, *c, *n);
}

void cuda_pwmin_sca3(void *a, void *b, real *c, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  pwmin_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *) a, (real *) b, *c, *n);
}
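/*
 * The pwmax/pwmin family applies a pointwise max or min, against a
 * second vector (vec2/vec3 forms) or a scalar (sca2/sca3 forms),
 * judging by the wrapper signatures. A sketch of the vec2 form under
 * that assumption, a(i) = max(a(i), b(i)) (not the actual kernel
 * source):
 */
template< typename T >
__global__ void pwmax_vec2_sketch(T * __restrict__ a,
                                  const T * __restrict__ b,
                                  const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str)
    a[i] = (a[i] > b[i]) ? a[i] : b[i];  /* pointwise maximum */
}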
/*
 * (Five __global__ template kernel prototypes stood here; their names
 * and parameter lists were mangled in extraction beyond reliable
 * recovery. The launch sites above show each kernel's template
 * argument and call signature.)
 */
/* C prototypes of the wrappers defined in this file: */
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op);
void cuda_masked_red_copy(void *a, void *b, void *mask, int *n, int *m);
void cuda_absval(void *a, int *n);
void cuda_invcol1(void *a, int *n);
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n);
void cuda_cadd2(void *a, void *b, real *c, int *n);
void cuda_pwmax_sca3(void *a, void *b, real *c, int *n);
void cuda_pwmin_vec3(void *a, void *b, void *c, int *n);
real cuda_vlsc3(void *u, void *v, void *w, int *n);
void cuda_add2s2(void *a, void *b, real *c1, int *n);
void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m);
void cuda_add3(void *a, void *b, void *c, int *n);
void cuda_col2(void *a, void *b, int *n);
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n);
void cuda_add4(void *a, void *b, void *c, void *d, int *n);
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n);
void cuda_pwmin_sca3(void *a, void *b, real *c, int *n);
void cuda_addcol3(void *a, void *b, void *c, int *n);
void cuda_pwmax_sca2(void *a, real *c, int *n);
void cuda_subcol3(void *a, void *b, void *c, int *n);
void cuda_cmult(void *a, real *c, int *n);
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n);
void cuda_add2s1(void *a, void *b, real *c1, int *n);
real cuda_glsum(void *a, int *n);
real cuda_glsc2(void *a, void *b, int *n);
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n);
void cuda_rzero(void *a, int *n);
void cuda_pwmin_sca2(void *a, real *c, int *n);
void cuda_pwmax_vec3(void *a, void *b, void *c, int *n);
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n);
void cuda_add2(void *a, void *b, int *n);
void cuda_copy(void *a, void *b, int *n);
void cuda_pwmax_vec2(void *a, void *b, int *n);
void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size);
void cuda_invcol2(void *a, void *b, int *n);
void cuda_pwmin_vec2(void *a, void *b, int *n);
void cuda_col3(void *a, void *b, void *c, int *n);
void cuda_cfill(void *a, real *c, int *n);
void cuda_cadd(void *a, real *c, int *n);
void cuda_sub2(void *a, void *b, int *n);
void cuda_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n);
real cuda_glsc3(void *a, void *b, void *c, int *n);
void cuda_cmult2(void *a, void *b, real *c, int *n);
void cuda_sub3(void *a, void *b, void *c, int *n);
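/*
 * Every wrapper takes its scalars by pointer, which is what makes the
 * API bindable from Fortran via iso_c_binding (pass-by-reference
 * convention). Hypothetical host-side usage, scaling a device vector in
 * place (example_scale and a_d are illustrative names only):
 */
void example_scale(void *a_d, int n) {
  real c = 0.5;
  /* a_d[0:n] *= 0.5, launched asynchronously on glb_cmd_queue */
  cuda_cmult(a_d, &c, &n);
}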