1 #ifndef __KRYLOV_GMRES_KERNEL_H__
2 #define __KRYLOV_GMRES_KERNEL_H__
43 template<
typename T >
45 T *
const * __restrict__
v,
46 const T * __restrict__ mult,
47 const T * __restrict__ h,
48 T * __restrict__ buf_h1,
52 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
53 const int str = blockDim.x * gridDim.x;
55 const unsigned int lane = threadIdx.x % warpSize;
56 const unsigned int wid = threadIdx.x / warpSize;
58 __shared__ T shared[64];
61 for (
int i = idx;
i < n;
i+= str) {
63 for (
int k = 0; k <
j; k ++) {
67 tmp1 +=
w[
i]*
w[
i]*mult[
i];
70 tmp1 = reduce_warp<T>(tmp1);
75 tmp1 = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
77 tmp1 = reduce_warp<T>(tmp1);
80 buf_h1[blockIdx.x] = tmp1;
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void gmres_part2_kernel(T *__restrict__ w, T *const *__restrict__ v, const T *__restrict__ mult, const T *__restrict__ h, T *__restrict__ buf_h1, const int j, const int n)