1 #ifndef __KRYLOV_PIPECG_KERNEL_H__
2 #define __KRYLOV_PIPECG_KERNEL_H__
42 template<
typename T >
52 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
53 const int str = blockDim.x * gridDim.x;
56 for (
int i = idx;
i < n;
i+= str) {
59 for (
int j = 0;
j < p_cur;
j ++) {
60 p[
i] = beta[
j]*p[
i] +
u[p_prev][
i];
65 u[p_space][
i] =
u[p_space-1][
i];
71 template<
typename T >
90 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
91 const int str = blockDim.x * gridDim.x;
93 const unsigned int lane = threadIdx.x % warpSize;
94 const unsigned int wid = threadIdx.x / warpSize;
96 __shared__ T buf1[32];
97 __shared__ T buf2[32];
98 __shared__ T buf3[32];
103 for (
int i = idx;
i < n;
i+= str) {
104 z[
i] = beta * z[
i] + ni[
i];
105 q[
i] = beta * q[
i] + mi[
i];
106 s[
i] = beta * s[
i] +
w[
i];
107 r[
i] = r[
i] - alpha * s[
i];
108 u2[
i] = u1[
i] - alpha * q[
i];
109 w[
i] =
w[
i] - alpha * z[
i];
110 tmp1 = tmp1 + r[
i] * mult[
i] * u2[
i];
111 tmp2 = tmp2 +
w[
i] * mult[
i] * u2[
i];
112 tmp3 = tmp3 + r[
i] * mult[
i] * r[
i];
116 tmp1 = reduce_warp<T>(tmp1);
117 tmp2 = reduce_warp<T>(tmp2);
118 tmp3 = reduce_warp<T>(tmp3);
126 tmp1 = (threadIdx.x < blockDim.x / warpSize) ? buf1[lane] : 0;
127 tmp2 = (threadIdx.x < blockDim.x / warpSize) ? buf2[lane] : 0;
128 tmp3 = (threadIdx.x < blockDim.x / warpSize) ? buf3[lane] : 0;
130 tmp1 = reduce_warp<T>(tmp1);
131 tmp2 = reduce_warp<T>(tmp2);
132 tmp3 = reduce_warp<T>(tmp3);
135 if (threadIdx.x == 0) {
136 buf_h1[blockIdx.x] = tmp1;
137 buf_h2[blockIdx.x] = tmp2;
138 buf_h3[blockIdx.x] = tmp3;
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void const T *__restrict__ x
__global__ void pipecg_vecops_kernel(T *__restrict__ p, T *__restrict__ q, T *__restrict__ r, T *__restrict__ s, T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ w, T *__restrict__ z, T *__restrict__ ni, T *__restrict__ mi, const T alpha, const T beta, const T *mult, T *buf_h1, T *buf_h2, T *buf_h3, const int n)
__global__ void cg_update_xp_kernel(T *__restrict__ x, T *__restrict__ p, T **__restrict__ u, const T *alpha, const T *beta, const int p_cur, const int p_space, const int n)