1 #ifndef __KRYLOV_FUSEDCG_KERNEL_H__
2 #define __KRYLOV_FUSEDCG_KERNEL_H__
42 template<
typename T >
44 const T * __restrict__ z,
45 const T * __restrict__ po,
49 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
50 const int str = blockDim.x * gridDim.x;
52 for (
int i = idx;
i < n;
i+= str) {
53 p[
i] = beta*po[
i] + z[
i];
61 template<
typename T >
64 const T * __restrict__ alpha,
68 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
69 const int str = blockDim.x * gridDim.x;
71 for (
int i = idx;
i < n;
i+= str) {
73 for (
int j = 0;
j < p_cur;
j ++) {
74 tmp += p[
j][
i] * alpha[
j];
86 const T * __restrict__ b,
87 const T * __restrict__ c,
92 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
93 const int str = blockDim.x * gridDim.x;
95 const unsigned int lane = threadIdx.x % warpSize;
96 const unsigned int wid = threadIdx.x / warpSize;
101 for (
int i = idx;
i < n;
i+= str) {
102 T rt = a[
i] - alpha * c[
i];
103 tmp = tmp + rt * b[
i] * rt;
107 tmp = reduce_warp<T>(tmp);
113 tmp = (threadIdx.x < blockDim.x / warpSize) ?
buf[lane] : 0;
115 tmp = reduce_warp<T>(tmp);
118 if (threadIdx.x == 0) {
119 buf_h[blockIdx.x] = tmp;
__global__ void const T *__restrict__ x
__global__ void fusedcg_update_p_kernel(T *__restrict__ p, const T *__restrict__ z, const T *__restrict__ po, const T beta, const int n)
__global__ void fusedcg_part2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T alpha, T *buf_h, const int n)
__global__ void fusedcg_update_x_kernel(T *__restrict__ x, const T **p, const T *__restrict__ alpha, const int p_cur, const int n)