35#include <hip/hip_runtime.h> 
   65    const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
 
 
   82    const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
 
 
   98    const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
 
 
  115    const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
 
 
  132      const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
 
  136                       *c, *size, (
int*)
mask, *mask_size);
 
 
  154    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  168    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  182    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  196    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  210    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  224    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  237    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  254    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  269    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  284    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  300    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  316    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  334    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
  338                       (
const real **) p, (
real *) alpha, *
j, *n);
 
 
  351    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  368    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  385    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  402    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  418    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  432    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  446    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  461    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  476    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  491    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  506    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  521    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  536    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  552    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  568    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  581                 void *v1, 
void *v2, 
void *v3, 
int *n,
 
  585    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  599                  void *v1, 
void *v2, 
void *v3,
 
  600                  void *w1, 
void *w2, 
void *
w3,
 
  604    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  645    #elif HAVE_DEVICE_MPI 
 
  662    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
  663    const int nb = ((*n) + 1024 - 1)/ 1024;
 
 
  691    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
  692    const int nb = ((*n) + 1024 - 1)/ 1024;
 
 
  724    const int nt = 1024/
pow2;
 
  729    const int nb = ((*n) + nt - 1)/nt;
 
 
  758    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
  759    const int nb = ((*n) + 1024 - 1)/ 1024;
 
 
  788      const dim3        nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 
  789      const int         nb     = ((*n) + 1024 - 1) / 1024;
 
 
  816    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
  817    const int nb = ((*n) + 1024 - 1)/ 1024;
 
 
  845    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
  863    const dim3        nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 
 
  877      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 
 
  892      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 
 
  906      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 
 
  921      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 
 
  936      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 
 
  951      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 
 
  966      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 
 
  982    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
 
__global__ void const T *__restrict__ x
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
 
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
 
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
 
void device_nccl_allreduce(void *sbuf_d, void *rbuf_d, int count, int nbytes, int op, void *stream)
 
void hip_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, hipStream_t strm)
 
void hip_global_reduce_add(real *bufred, void *bufred_d, int n, const hipStream_t stream)
 
void hip_col3(void *a, void *b, void *c, int *n, hipStream_t strm)
 
void hip_addsqr2s2(void *a, void *b, real *c1, int *n, hipStream_t strm)
 
void hip_rzero(void *a, int *n, hipStream_t strm)
 
void hip_cfill_mask(void *a, real *c, int *size, void *mask, int *mask_size, hipStream_t strm)
 
void hip_pwmax_sca3(void *a, void *b, real *c, int *n, hipStream_t stream)
 
void hip_invcol3(void *a, void *b, void *c, int *n, hipStream_t strm)
 
void hip_absval(void *a, int *n, hipStream_t stream)
 
void hip_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, hipStream_t strm)
 
real hip_glsubnorm2(void *a, void *b, int *n, hipStream_t stream)
 
void hip_addcol3(void *a, void *b, void *c, int *n, hipStream_t strm)
 
void hip_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, hipStream_t strm)
 
void hip_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, hipStream_t strm)
 
void hip_add2s2(void *a, void *b, real *c1, int *n, hipStream_t strm)
 
real hip_glsum(void *a, int *n, hipStream_t stream)
 
void hip_add4(void *a, void *b, void *c, void *d, int *n, hipStream_t strm)
 
void hip_cadd2(void *a, void *b, real *c, int *n, hipStream_t strm)
 
real hip_vlsc3(void *u, void *v, void *w, int *n, hipStream_t stream)
 
void hip_cdiv2(void *a, void *b, real *c, int *n, hipStream_t strm)
 
void hip_pwmin_sca2(void *a, real *c, int *n, hipStream_t stream)
 
void hip_cmult2(void *a, void *b, real *c, int *n, hipStream_t strm)
 
real hip_glsc3(void *a, void *b, void *c, int *n, hipStream_t stream)
 
void hip_invcol1(void *a, int *n, hipStream_t strm)
 
void hip_invcol2(void *a, void *b, int *n, hipStream_t strm)
 
void hip_masked_copy(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
 
void hip_cdiv(void *a, real *c, int *n, hipStream_t strm)
 
void hip_sub2(void *a, void *b, int *n, hipStream_t strm)
 
real hip_glsc2(void *a, void *b, int *n, hipStream_t stream)
 
void hip_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, hipStream_t strm)
 
void hip_col2(void *a, void *b, int *n, hipStream_t strm)
 
void hip_pwmin_vec3(void *a, void *b, void *c, int *n, hipStream_t stream)
 
void hip_addcol3s2(void *a, void *b, void *c, real *s, int *n, hipStream_t strm)
 
void hip_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
 
void hip_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, hipStream_t stream)
 
void hip_cfill(void *a, real *c, int *n, hipStream_t strm)
 
void hip_masked_atomic_reduction(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
 
void hip_add3(void *a, void *b, void *c, int *n, hipStream_t strm)
 
void hip_pwmax_vec3(void *a, void *b, void *c, int *n, hipStream_t stream)
 
void hip_add2(void *a, void *b, int *n, hipStream_t strm)
 
void hip_copy(void *a, void *b, int *n, hipStream_t strm)
 
void hip_subcol3(void *a, void *b, void *c, int *n, hipStream_t strm)
 
void hip_pwmax_sca2(void *a, real *c, int *n, hipStream_t stream)
 
void hip_iadd(void *a, int *c, int *n, hipStream_t stream)
 
void hip_pwmin_sca3(void *a, void *b, real *c, int *n, hipStream_t stream)
 
void hip_add2s1(void *a, void *b, real *c1, int *n, hipStream_t strm)
 
void hip_radd(void *a, real *c, int *n, hipStream_t strm)
 
void hip_redbuf_check_alloc(int nb)
 
void hip_add2s2_many(void *x, void **p, void *alpha, int *j, int *n, hipStream_t strm)
 
void hip_addcol4(void *a, void *b, void *c, void *d, int *n, hipStream_t strm)
 
void hip_pwmax_vec2(void *a, void *b, int *n, hipStream_t stream)
 
void hip_pwmin_vec2(void *a, void *b, int *n, hipStream_t stream)
 
void hip_sub3(void *a, void *b, void *c, int *n, hipStream_t strm)
 
void hip_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, hipStream_t strm)
 
void hip_cmult(void *a, real *c, int *n, hipStream_t strm)
 
Object for handling masks in Neko.