35#include <hip/hip_runtime.h>
65 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
82 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
99 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
115 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
128 void *facet,
int *n1,
int *n2,
int *lx,
129 int *ly,
int *lz,
int *m,
133 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
137 (
int *)
mask, (
int *) facet, *n1, *n2, *lx, *ly, *lz,
151 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
168 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
185 const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
202 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
206 *c, *size, (
int*)
mask, *mask_size);
224 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
238 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
252 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
266 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
280 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
294 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
309 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
323 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
340 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
355 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
370 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
386 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
402 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
420 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
424 (
const real **) p, (
real *) alpha, *
j, *n);
437 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
454 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
471 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
488 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
504 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
518 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
532 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
547 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
562 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
577 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
592 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
607 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
622 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
638 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
654 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
667 void *v1,
void *v2,
void *v3,
int *n,
671 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
685 void *v1,
void *v2,
void *v3,
686 void *w1,
void *w2,
void *
w3,
690 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
731 #elif HAVE_DEVICE_MPI
753 #elif HAVE_DEVICE_MPI
774 #elif HAVE_DEVICE_MPI
791 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
792 const int nb = ((*n) + 1024 - 1)/ 1024;
820 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
821 const int nb = ((*n) + 1024 - 1)/ 1024;
853 const int nt = 1024/
pow2;
858 const int nb = ((*n) + nt - 1)/nt;
887 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
888 const int nb = ((*n) + 1024 - 1)/ 1024;
917 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
918 const int nb = ((*n) + 1024 - 1) / 1024;
945 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
946 const int nb = ((*n) + 1024 - 1)/ 1024;
974 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
975 const int nb = ((*n) + 1024 - 1)/ 1024;
1002 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
1003 const int nb = ((*n) + 1024 - 1)/ 1024;
1031 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
1049 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1063 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1078 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1092 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1107 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1122 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1137 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1152 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
1168 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
void device_nccl_allreduce(void *sbuf_d, void *rbuf_d, int count, int nbytes, int op, void *stream)
void hip_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, hipStream_t strm)
void hip_global_reduce_add(real *bufred, void *bufred_d, int n, const hipStream_t stream)
void hip_col3(void *a, void *b, void *c, int *n, hipStream_t strm)
void hip_cwrap(void *a, real *min_val, real *max_val, int *n, hipStream_t strm)
void hip_addsqr2s2(void *a, void *b, real *c1, int *n, hipStream_t strm)
real hip_glmax(void *a, real *ninf, int *n, hipStream_t stream)
void hip_rzero(void *a, int *n, hipStream_t strm)
void hip_cfill_mask(void *a, real *c, int *size, void *mask, int *mask_size, hipStream_t strm)
void hip_pwmax_sca3(void *a, void *b, real *c, int *n, hipStream_t stream)
void hip_invcol3(void *a, void *b, void *c, int *n, hipStream_t strm)
void hip_absval(void *a, int *n, hipStream_t stream)
void hip_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
void hip_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, hipStream_t strm)
real hip_glsubnorm2(void *a, void *b, int *n, hipStream_t stream)
void hip_addcol3(void *a, void *b, void *c, int *n, hipStream_t strm)
void hip_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, hipStream_t strm)
void hip_masked_scatter_copy_aligned(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
void hip_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, hipStream_t strm)
void hip_masked_copy_aligned(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
void hip_add2s2(void *a, void *b, real *c1, int *n, hipStream_t strm)
real hip_glsum(void *a, int *n, hipStream_t stream)
void hip_add4(void *a, void *b, void *c, void *d, int *n, hipStream_t strm)
void hip_cadd2(void *a, void *b, real *c, int *n, hipStream_t strm)
real hip_vlsc3(void *u, void *v, void *w, int *n, hipStream_t stream)
void hip_cdiv2(void *a, void *b, real *c, int *n, hipStream_t strm)
void hip_pwmin_sca2(void *a, real *c, int *n, hipStream_t stream)
void hip_cmult2(void *a, void *b, real *c, int *n, hipStream_t strm)
real hip_glsc3(void *a, void *b, void *c, int *n, hipStream_t stream)
void hip_invcol1(void *a, int *n, hipStream_t strm)
void hip_invcol2(void *a, void *b, int *n, hipStream_t strm)
void hip_global_reduce_max(real *bufred, void *bufred_d, int n, const hipStream_t stream)
void hip_cdiv(void *a, real *c, int *n, hipStream_t strm)
void hip_sub2(void *a, void *b, int *n, hipStream_t strm)
real hip_glsc2(void *a, void *b, int *n, hipStream_t stream)
void hip_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, hipStream_t strm)
void hip_col2(void *a, void *b, int *n, hipStream_t strm)
real hip_glmin(void *a, real *pinf, int *n, hipStream_t stream)
void hip_pwmin_vec3(void *a, void *b, void *c, int *n, hipStream_t stream)
void hip_addcol3s2(void *a, void *b, void *c, real *s, int *n, hipStream_t strm)
void hip_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
void hip_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, hipStream_t stream)
void hip_cfill(void *a, real *c, int *n, hipStream_t strm)
void hip_masked_atomic_reduction(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
void hip_masked_copy_0(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
void hip_add3(void *a, void *b, void *c, int *n, hipStream_t strm)
void hip_pwmax_vec3(void *a, void *b, void *c, int *n, hipStream_t stream)
void hip_add2(void *a, void *b, int *n, hipStream_t strm)
void hip_copy(void *a, void *b, int *n, hipStream_t strm)
void hip_subcol3(void *a, void *b, void *c, int *n, hipStream_t strm)
void hip_pwmax_sca2(void *a, real *c, int *n, hipStream_t stream)
void hip_iadd(void *a, int *c, int *n, hipStream_t stream)
void hip_pwmin_sca3(void *a, void *b, real *c, int *n, hipStream_t stream)
void hip_add2s1(void *a, void *b, real *c1, int *n, hipStream_t strm)
void hip_radd(void *a, real *c, int *n, hipStream_t strm)
void hip_redbuf_check_alloc(int nb)
void hip_add2s2_many(void *x, void **p, void *alpha, int *j, int *n, hipStream_t strm)
void hip_addcol4(void *a, void *b, void *c, void *d, int *n, hipStream_t strm)
void hip_pwmax_vec2(void *a, void *b, int *n, hipStream_t stream)
void hip_pwmin_vec2(void *a, void *b, int *n, hipStream_t stream)
void hip_sub3(void *a, void *b, void *c, int *n, hipStream_t strm)
void hip_face_masked_gather_copy(void *a, void *b, void *mask, void *facet, int *n1, int *n2, int *lx, int *ly, int *lz, int *m, hipStream_t strm)
void hip_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
void hip_cmult(void *a, real *c, int *n, hipStream_t strm)
void hip_global_reduce_min(real *bufred, void *bufred_d, int n, const hipStream_t stream)
Object for handling masks in Neko.