1 #ifndef __KRYLOV_PIPECG_KERNEL_H__
2 #define __KRYLOV_PIPECG_KERNEL_H__
42 template<
typename T >
52 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
53 const int str = blockDim.x * gridDim.x;
56 for (
int i = idx;
i < n;
i+= str) {
59 for (
int j = 0;
j < p_cur;
j ++) {
60 p[
i] = beta[
j]*p[
i] +
u[p_prev][
i];
65 u[p_space][
i] =
u[p_space-1][
i];
72 template<
typename T >
91 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
92 const int str = blockDim.x * gridDim.x;
94 const unsigned int lane = threadIdx.x % warpSize;
95 const unsigned int wid = threadIdx.x / warpSize;
97 __shared__ T buf1[64];
98 __shared__ T buf2[64];
99 __shared__ T buf3[64];
104 for (
int i = idx;
i < n;
i+= str) {
105 z[
i] = beta * z[
i] + ni[
i];
106 q[
i] = beta * q[
i] + mi[
i];
107 s[
i] = beta * s[
i] +
w[
i];
108 r[
i] = r[
i] - alpha * s[
i];
109 u2[
i] = u1[
i] - alpha * q[
i];
110 w[
i] =
w[
i] - alpha * z[
i];
111 tmp1 = tmp1 + r[
i] * mult[
i] * u2[
i];
112 tmp2 = tmp2 +
w[
i] * mult[
i] * u2[
i];
113 tmp3 = tmp3 + r[
i] * mult[
i] * r[
i];
117 tmp1 = reduce_warp<T>(tmp1);
118 tmp2 = reduce_warp<T>(tmp2);
119 tmp3 = reduce_warp<T>(tmp3);
127 tmp1 = (threadIdx.x < blockDim.x / warpSize) ? buf1[lane] : 0;
128 tmp2 = (threadIdx.x < blockDim.x / warpSize) ? buf2[lane] : 0;
129 tmp3 = (threadIdx.x < blockDim.x / warpSize) ? buf3[lane] : 0;
131 tmp1 = reduce_warp<T>(tmp1);
132 tmp2 = reduce_warp<T>(tmp2);
133 tmp3 = reduce_warp<T>(tmp3);
136 if (threadIdx.x == 0) {
137 buf_h1[blockIdx.x] = tmp1;
138 buf_h2[blockIdx.x] = tmp2;
139 buf_h3[blockIdx.x] = tmp3;
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void const T *__restrict__ x
__global__ void pipecg_vecops_kernel(T *__restrict__ p, T *__restrict__ q, T *__restrict__ r, T *__restrict__ s, T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ w, T *__restrict__ z, T *__restrict__ ni, T *__restrict__ mi, const T alpha, const T beta, const T *mult, T *buf_h1, T *buf_h2, T *buf_h3, const int n)
__global__ void cg_update_xp_kernel(T *__restrict__ x, T *__restrict__ p, T **__restrict__ u, const T *alpha, const T *beta, const int p_cur, const int p_space, const int n)