1#ifndef __MATH_DEVICE_MPI_REDUCE_H__
2#define __MATH_DEVICE_MPI_REDUCE_H__
/**
 * Signature of the CUDA kernel `dirichlet_apply_scalar_kernel`
 * (`__global__`, returns void).
 *
 * @param msk  Pointer to const int data; the kernel cannot write through it.
 *             Declared `__restrict__`.
 * @param x    Pointer to non-const T data. Declared `__restrict__`.
 * @param g    Scalar of type T, passed by value.
 * @param m    Integer passed by value.
 *
 * NOTE(review): `T` is not declared anywhere in the visible source --
 * presumably a template parameter or typedef introduced elsewhere; confirm.
 * NOTE(review): no body or terminating ';' is visible here. Judging only by
 * the name, this presumably stores `g` into the entries of `x` selected by
 * the `m` indices in `msk` -- verify against the implementation, including
 * the index base used by `msk`.
 */
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
/**
 * Signature of `device_mpi_allreduce` (returns void).
 *
 * @param buf_d   Untyped, non-const pointer.
 * @param buf     Untyped, non-const pointer.
 * @param count   Integer passed by value.
 * @param nbytes  size_t passed by value.
 * @param op      Integer passed by value.
 *
 * NOTE(review): no body or terminating ';' is visible here, so the roles
 * below are inferred from the names only -- confirm against the definition:
 *   - `buf_d` is presumably a device-resident buffer and `buf` a host-side
 *     buffer used for the reduction;
 *   - `count` is presumably the number of elements, and `nbytes` either the
 *     per-element size or the total byte size (cannot tell from here);
 *   - `op` presumably selects the reduction operation; the valid values are
 *     not defined in the visible source.
 */
void device_mpi_allreduce(void *buf_d, void *buf, int count, size_t nbytes, int op)
/**
 * Signature of `device_mpi_allreduce_inplace` (returns void).
 *
 * @param buf_d   Untyped, non-const pointer.
 * @param count   Integer passed by value.
 * @param nbytes  size_t passed by value.
 * @param op      Integer passed by value.
 *
 * NOTE(review): no body or terminating ';' is visible here, so the roles
 * below are inferred from the names only -- confirm against the definition:
 *   - `buf_d` is presumably a device-resident buffer that serves as both
 *     input and output of the reduction (hence "inplace");
 *   - `count` is presumably the number of elements, and `nbytes` either the
 *     per-element size or the total byte size (cannot tell from here);
 *   - `op` presumably selects the reduction operation; the valid values are
 *     not defined in the visible source.
 */
void device_mpi_allreduce_inplace(void *buf_d, int count, size_t nbytes, int op)