72 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
88 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
103 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
118 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
130 void *facet,
int *n1,
int *n2,
int *lx,
131 int *ly,
int *lz,
int *m,
135 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
138 ((
real *)
a, (
real *) b, (
int *)
mask, (
int *) facet, *n1, *n2, *lx,
151 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
165 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
179 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
194 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
214 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
227 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
241 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
254 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
268 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
282 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
298 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
312 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
328 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
343 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
358 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
373 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
389 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
407 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
423 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
440 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
457 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
474 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
490 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
503 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
517 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
531 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
544 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
558 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
573 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
587 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
601 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
616 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
631 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
643 void *v1,
void *v2,
void *v3,
int *n,
647 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
660 void *v1,
void *v2,
void *v3,
661 void *w1,
void *w2,
void *
w3,
665 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
718 if (
sizeof(
real) ==
sizeof(
float)) {
723 else if (
sizeof(
real) ==
sizeof(
double)) {
755 if (
sizeof(
real) ==
sizeof(
float)) {
760 else if (
sizeof(
real) ==
sizeof(
double)) {
792 if (
sizeof(
real) ==
sizeof(
float)) {
797 else if (
sizeof(
real) ==
sizeof(
double)) {
824 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
825 const int nb = ((*n) + 1024 - 1)/ 1024;
852 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
853 const int nb = ((*n) + 1024 - 1)/ 1024;
882 const int nt = 1024/
pow2;
885 const int nb = ((*n) + nt - 1)/nt;
911 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
912 const int nb = ((*n) + 1024 - 1)/ 1024;
940 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
941 const int nb = ((*n) + 1024 - 1)/ 1024;
968 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
969 const int nb = ((*n) + 1024 - 1)/ 1024;
995 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
996 const int nb = ((*n) + 1024 - 1)/ 1024;
1023 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
1024 const int nb = ((*n) + 1024 - 1)/ 1024;
1052 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
1070 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1084 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1098 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1101 ((
real *)
a, *c, *n);
1112 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1126 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1140 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1154 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1167 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1182 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
1185 ((
int *)
a, *c, *n);
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
void device_nccl_allreduce(void *sbuf_d, void *rbuf_d, int count, int nbytes, int op, void *stream)
void cuda_global_reduce_min(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
void cuda_pwmax_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
void cuda_add2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
real cuda_glsubnorm2(void *a, void *b, int *n, cudaStream_t stream)
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cudaStream_t strm)
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cudaStream_t strm)
void cuda_face_masked_gather_copy(void *a, void *b, void *mask, void *facet, int *n1, int *n2, int *lx, int *ly, int *lz, int *m, cudaStream_t strm)
void cuda_global_reduce_add(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
real cuda_glsc2(void *a, void *b, int *n, cudaStream_t stream)
void cuda_addcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
void cuda_pwmin_vec2(void *a, void *b, int *n, cudaStream_t stream)
void cuda_pwmax_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
real cuda_glmin(void *a, real *pinf, int *n, cudaStream_t stream)
void cuda_cmult2(void *a, void *b, real *c, int *n, cudaStream_t strm)
void cuda_add2s1(void *a, void *b, real *c1, int *n, cudaStream_t strm)
void cuda_col3(void *a, void *b, void *c, int *n, cudaStream_t strm)
void cuda_masked_copy_0(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n, cudaStream_t strm)
void cuda_sub3(void *a, void *b, void *c, int *n, cudaStream_t strm)
void cuda_copy(void *a, void *b, int *n, cudaStream_t strm)
void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size, cudaStream_t strm)
void cuda_masked_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
void cuda_invcol2(void *a, void *b, int *n, cudaStream_t strm)
void cuda_pwmax_sca2(void *a, real *c, int *n, cudaStream_t stream)
void cuda_col2(void *a, void *b, int *n, cudaStream_t strm)
void cuda_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
void cuda_add2(void *a, void *b, int *n, cudaStream_t strm)
void cuda_redbuf_check_alloc(int nb)
void cuda_cmult(void *a, real *c, int *n, cudaStream_t strm)
void cuda_add4(void *a, void *b, void *c, void *d, int *n, cudaStream_t strm)
void cuda_masked_scatter_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
void cuda_sub2(void *a, void *b, int *n, cudaStream_t strm)
void cuda_invcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
void cuda_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cudaStream_t strm)
void cuda_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
real cuda_glsum(void *a, int *n, cudaStream_t stream)
real cuda_glsc3(void *a, void *b, void *c, int *n, cudaStream_t stream)
real cuda_vlsc3(void *u, void *v, void *w, int *n, cudaStream_t stream)
void cuda_pwmin_sca3(void *a, void *b, real *c, int *n, cudaStream_t stream)
void cuda_cadd2(void *a, void *b, real *c, int *n, cudaStream_t strm)
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n, cudaStream_t strm)
void cuda_add3(void *a, void *b, void *c, int *n, cudaStream_t strm)
void cuda_addcol3s2(void *a, void *b, void *c, real *s, int *n, cudaStream_t strm)
void cuda_cwrap(void *a, real *min_val, real *max_val, int *n, cudaStream_t strm)
void cuda_pwmin_sca2(void *a, real *c, int *n, cudaStream_t stream)
void cuda_cfill(void *a, real *c, int *n, cudaStream_t strm)
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cudaStream_t stream)
void cuda_absval(void *a, int *n, cudaStream_t stream)
void cuda_rzero(void *a, int *n, cudaStream_t strm)
void cuda_cdiv2(void *a, void *b, real *c, int *n, cudaStream_t strm)
void cuda_cdiv(void *a, real *c, int *n, cudaStream_t strm)
void cuda_global_reduce_max(real *bufred, void *bufred_d, int n, const cudaStream_t stream)
void cuda_iadd(void *a, int *c, int *n, cudaStream_t stream)
void cuda_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
void cuda_pwmax_vec2(void *a, void *b, int *n, cudaStream_t stream)
void cuda_masked_atomic_reduction(void *a, void *b, void *mask, int *n, int *m, cudaStream_t strm)
void cuda_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, cudaStream_t strm)
void cuda_invcol1(void *a, int *n, cudaStream_t strm)
void cuda_radd(void *a, real *c, int *n, cudaStream_t strm)
void cuda_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, cudaStream_t strm)
real cuda_glmax(void *a, real *ninf, int *n, cudaStream_t stream)
void cuda_subcol3(void *a, void *b, void *c, int *n, cudaStream_t strm)
void cuda_pwmin_vec3(void *a, void *b, void *c, int *n, cudaStream_t stream)
Object for handling masks in Neko.