1 #ifndef __KRYLOV_FUSEDCG_CPLD_KERNEL_H__
2 #define __KRYLOV_FUSEDCG_CPLD_KERNEL_H__
42 template<
typename T >
52 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
53 const int str = blockDim.x * gridDim.x;
55 for (
int i = idx;
i < n;
i+= str) {
56 tmp[
i] = a1[
i]*b1[
i] + a2[
i]*b2[
i] + a3[
i]*b3[
i];
64 template<
typename T >
68 const T * __restrict__ z1,
69 const T * __restrict__ z2,
70 const T * __restrict__ z3,
71 const T * __restrict__ po1,
72 const T * __restrict__ po2,
73 const T * __restrict__ po3,
77 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
78 const int str = blockDim.x * gridDim.x;
80 for (
int i = idx;
i < n;
i+= str) {
81 p1[
i] = beta*po1[
i] + z1[
i];
82 p2[
i] = beta*po2[
i] + z2[
i];
83 p3[
i] = beta*po3[
i] + z3[
i];
91 template<
typename T >
98 const T * __restrict__ alpha,
102 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
103 const int str = blockDim.x * gridDim.x;
105 for (
int i = idx;
i < n;
i+= str) {
109 for (
int j = 0;
j < p_cur;
j ++) {
110 tmp1 += p1[
j][
i] * alpha[
j];
111 tmp2 += p2[
j][
i] * alpha[
j];
112 tmp3 += p3[
j][
i] * alpha[
j];
124 template<
typename T>
128 const T * __restrict__ b,
129 const T * __restrict__ c1,
130 const T * __restrict__ c2,
131 const T * __restrict__ c3,
136 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
137 const int str = blockDim.x * gridDim.x;
139 const unsigned int lane = threadIdx.x % warpSize;
140 const unsigned int wid = threadIdx.x / warpSize;
142 __shared__ T
buf[32];
145 for (
int i = idx;
i < n;
i+= str) {
146 T rt1 = a1[
i] - alpha * c1[
i];
147 T rt2 = a2[
i] - alpha * c2[
i];
148 T rt3 = a3[
i] - alpha * c3[
i];
149 tmp = tmp + ((rt1*rt1 + rt2*rt2 + rt3*rt3) * b[
i]);
155 tmp = reduce_warp<T>(tmp);
161 tmp = (threadIdx.x < blockDim.x / warpSize) ?
buf[lane] : 0;
163 tmp = reduce_warp<T>(tmp);
166 if (threadIdx.x == 0) {
167 buf_h[blockIdx.x] = tmp;
__global__ void fusedcg_cpld_update_p_kernel(T *__restrict__ p1, T *__restrict__ p2, T *__restrict__ p3, const T *__restrict__ z1, const T *__restrict__ z2, const T *__restrict__ z3, const T *__restrict__ po1, const T *__restrict__ po2, const T *__restrict__ po3, const T beta, const int n)
__global__ void fusedcg_cpld_part2_kernel(T *__restrict__ a1, T *__restrict__ a2, T *__restrict__ a3, const T *__restrict__ b, const T *__restrict__ c1, const T *__restrict__ c2, const T *__restrict__ c3, const T alpha, T *buf_h, const int n)
__global__ void fusedcg_cpld_update_x_kernel(T *__restrict__ x1, T *__restrict__ x2, T *__restrict__ x3, const T **p1, const T **p2, const T **p3, const T *__restrict__ alpha, const int p_cur, const int n)
__global__ void fusedcg_cpld_part1_kernel(T *__restrict__ a1, T *__restrict__ a2, T *__restrict__ a3, T *__restrict__ b1, T *__restrict__ b2, T *__restrict__ b3, T *__restrict__ tmp, const int n)