1#ifndef __KRYLOV_FUSEDCG_CPLD_KERNEL_H__
2#define __KRYLOV_FUSEDCG_CPLD_KERNEL_H__
55 for (
int i = idx;
i < n;
i+=
str) {
80 for (
int i = idx;
i < n;
i+=
str) {
81 p1[
i] = beta*
po1[
i] + z1[
i];
82 p2[
i] = beta*
po2[
i] + z2[
i];
83 p3[
i] = beta*
po3[
i] + z3[
i];
105 for (
int i = idx;
i < n;
i+=
str) {
145 for (
int i = idx;
i < n;
i+=
str) {
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void fusedcg_cpld_update_p_kernel(T *__restrict__ p1, T *__restrict__ p2, T *__restrict__ p3, const T *__restrict__ z1, const T *__restrict__ z2, const T *__restrict__ z3, const T *__restrict__ po1, const T *__restrict__ po2, const T *__restrict__ po3, const T beta, const int n)
__global__ void fusedcg_cpld_part2_kernel(T *__restrict__ a1, T *__restrict__ a2, T *__restrict__ a3, const T *__restrict__ b, const T *__restrict__ c1, const T *__restrict__ c2, const T *__restrict__ c3, const T alpha, T *buf_h, const int n)
__global__ void fusedcg_cpld_update_x_kernel(T *__restrict__ x1, T *__restrict__ x2, T *__restrict__ x3, const T **p1, const T **p2, const T **p3, const T *__restrict__ alpha, const int p_cur, const int n)
__global__ void fusedcg_cpld_part1_kernel(T *__restrict__ a1, T *__restrict__ a2, T *__restrict__ a3, T *__restrict__ b1, T *__restrict__ b2, T *__restrict__ b3, T *__restrict__ tmp, const int n)