61 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
75 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
110 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
124 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
138 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
153 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
167 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
184 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
199 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
214 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
229 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
245 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
262 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
266 (
real *) alpha, *
j, *n);
279 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
296 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
312 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
326 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
340 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
354 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
368 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
383 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
397 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
412 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
427 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
440 void *v1,
void *v2,
void *v3,
int *n) {
443 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
458 void *v1,
void *v2,
void *v3,
459 void *w1,
void *w2,
void *
w3,
int *n) {
462 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
489 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
490 const int nb = ((*n) + 1024 - 1)/ 1024;
523 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
524 const int nb = ((*n) + 1024 - 1)/ 1024;
543#ifdef HAVE_DEVICE_MPI
564 const int nt = 1024/
pow2;
567 const int nb = ((*n) + nt - 1)/nt;
587#ifdef HAVE_DEVICE_MPI
604 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
605 const int nb = ((*n) + 1024 - 1)/ 1024;
626#ifdef HAVE_DEVICE_MPI
644 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
645 const int nb = ((*n) + 1024 - 1)/ 1024;
664#ifdef HAVE_DEVICE_MPI
683 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
702 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
717 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
732 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
747 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
762 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
777 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
792 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
807 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
void cuda_masked_red_copy(void *a, void *b, void *mask, int *n, int *m)
void cuda_absval(void *a, int *n)
void cuda_invcol1(void *a, int *n)
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n)
void cuda_cadd2(void *a, void *b, real *c, int *n)
void cuda_pwmax_sca3(void *a, void *b, real *c, int *n)
void cuda_pwmin_vec3(void *a, void *b, void *c, int *n)
real cuda_vlsc3(void *u, void *v, void *w, int *n)
void cuda_add2s2(void *a, void *b, real *c1, int *n)
void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m)
void cuda_add3(void *a, void *b, void *c, int *n)
void cuda_col2(void *a, void *b, int *n)
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
void cuda_add4(void *a, void *b, void *c, void *d, int *n)
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
void cuda_pwmin_sca3(void *a, void *b, real *c, int *n)
void cuda_addcol3(void *a, void *b, void *c, int *n)
void cuda_pwmax_sca2(void *a, real *c, int *n)
void cuda_subcol3(void *a, void *b, void *c, int *n)
void cuda_cmult(void *a, real *c, int *n)
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n)
void cuda_add2s1(void *a, void *b, real *c1, int *n)
real cuda_glsum(void *a, int *n)
real cuda_glsc2(void *a, void *b, int *n)
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
void cuda_rzero(void *a, int *n)
void cuda_pwmin_sca2(void *a, real *c, int *n)
void cuda_pwmax_vec3(void *a, void *b, void *c, int *n)
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n)
void cuda_add2(void *a, void *b, int *n)
void cuda_copy(void *a, void *b, int *n)
void cuda_pwmax_vec2(void *a, void *b, int *n)
void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size)
void cuda_invcol2(void *a, void *b, int *n)
void cuda_pwmin_vec2(void *a, void *b, int *n)
void cuda_col3(void *a, void *b, void *c, int *n)
void cuda_cfill(void *a, real *c, int *n)
void cuda_cadd(void *a, real *c, int *n)
void cuda_sub2(void *a, void *b, int *n)
void cuda_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n)
real cuda_glsc3(void *a, void *b, void *c, int *n)
void cuda_cmult2(void *a, void *b, real *c, int *n)
void cuda_sub3(void *a, void *b, void *c, int *n)