1#ifndef __MATH_DEVICE_MPI_REDUCE_H__ 
    2#define __MATH_DEVICE_MPI_REDUCE_H__ 
/*
 * dirichlet_apply_scalar_kernel -- CUDA kernel entry point (__global__).
 *
 * Only the signature is visible in this view; the body is not shown.
 *
 * Parameters, as declared:
 *   msk  read-only pointer to int, declared __restrict__
 *   x    writable pointer to T, declared __restrict__
 *   g    scalar of type T, passed by value
 *   m    int, passed by value
 *
 * NOTE(review): T is not declared in the visible lines; presumably a
 *   `template <typename T>` line precedes this signature -- confirm.
 * NOTE(review): judging by the name only, this presumably stores g into the
 *   entries of x selected by msk, with m being the number of mask entries.
 *   The index base of msk and the expected launch configuration cannot be
 *   determined from here -- verify against the kernel body and its launcher.
 */
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
 
/*
 * device_mpi_allreduce -- host-callable function (no CUDA execution-space
 * qualifier). Only the signature is visible in this view.
 *
 * Parameters, as declared:
 *   buf_d   untyped (void *) buffer pointer
 *   buf     untyped (void *) buffer pointer
 *   count   int
 *   nbytes  size_t
 *   op      int
 *
 * NOTE(review): presumably buf_d is a device buffer and buf a host-side
 *   buffer used alongside it, count is the number of elements, nbytes a
 *   size in bytes (whether per element or for the whole buffer is not
 *   visible), and op an integer code selecting the reduction operation.
 *   None of this is established by the visible lines -- confirm against the
 *   implementation and its callers.
 */
void device_mpi_allreduce(void *buf_d, void *buf, int count, size_t nbytes, int op)
 
/*
 * device_mpi_allreduce_inplace -- host-callable function (no CUDA
 * execution-space qualifier). Only the signature is visible in this view.
 *
 * Parameters, as declared:
 *   buf_d   untyped (void *) buffer pointer
 *   count   int
 *   nbytes  size_t
 *   op      int
 *
 * NOTE(review): presumably an in-place variant of an all-reduce in which
 *   buf_d serves as both input and output, with count the number of
 *   elements, nbytes a size in bytes, and op an integer code selecting the
 *   reduction operation. The visible lines do not establish the memory
 *   space of buf_d or the meaning of op -- confirm against the
 *   implementation.
 */
void device_mpi_allreduce_inplace(void *buf_d, int count, size_t nbytes, int op)