49 for (
int i = idx;
i < n;
i +=
str) {
67 for (
int i = idx;
i < n;
i +=
str) {
83 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
97 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
__global__ void cheby_part1(T *__restrict__ d, T *__restrict__ x, const T inv_tha, const int n)
__global__ void cheby_part2(T *__restrict__ d, T *__restrict__ w, T *__restrict__ x, const T tmp1, const T tmp2, const int n)
void cuda_cheby_part1(void *d, void *x, real *inv_tha, int *n, cudaStream_t strm)
void cuda_cheby_part2(void *d, void *w, void *x, real *tmp1, real *tmp2, int *n, cudaStream_t strm)
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)