35#include <hip/hip_runtime.h> 
   62    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
   75    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
 
   87                          void *alpha_d , 
real *alpha, 
int *
p_cur, 
int * n) {
 
   90    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
   91    const int nb = ((*n) + 1024 - 1)/ 1024;
 
 
__global__ void const T *__restrict__ x
 
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
 
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
 
void device_nccl_allreduce(void *sbuf_d, void *rbuf_d, int count, int nbytes, int op, void *stream)
 
void hip_fusedcg_update_x(void *x, void *p, void *alpha, int *p_cur, int *n)
 
real hip_fusedcg_part2(void *a, void *b, void *c, void *alpha_d, real *alpha, int *p_cur, int *n)
 
void hip_fusedcg_update_p(void *p, void *z, void *po, real *beta, int *n)