1 #ifndef __MATH_CFL_KERNEL_H__
2 #define __MATH_CFL_KERNEL_H__
43 val = fmax(val, __shfl_down(val, 32));
44 val = fmax(val, __shfl_down(val, 16));
45 val = fmax(val, __shfl_down(val, 8));
46 val = fmax(val, __shfl_down(val, 4));
47 val = fmax(val, __shfl_down(val, 2));
48 val = fmax(val, __shfl_down(val, 1));
55 template<
typename T >
59 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
60 const int str = blockDim.x * gridDim.x;
61 for (
int i = idx;
i<n ;
i += str)
66 __shared__ T shared[64];
67 unsigned int lane = threadIdx.x % warpSize;
68 unsigned int wid = threadIdx.x / warpSize;
70 cfl = cfl_reduce_warp<T>(
cfl);
75 cfl = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
77 cfl = cfl_reduce_warp<T>(
cfl);
87 template<
typename T, const
int LX, const
int CHUNKS >
89 const T * __restrict__
u,
90 const T * __restrict__
v,
91 const T * __restrict__
w,
92 const T * __restrict__
drdx,
93 const T * __restrict__
dsdx,
94 const T * __restrict__
dtdx,
95 const T * __restrict__
drdy,
96 const T * __restrict__
dsdy,
97 const T * __restrict__
dtdy,
98 const T * __restrict__
drdz,
99 const T * __restrict__
dsdz,
100 const T * __restrict__
dtdz,
101 const T * __restrict__ dr_inv,
102 const T * __restrict__ ds_inv,
103 const T * __restrict__ dt_inv,
104 const T * __restrict__
jacinv,
105 T * __restrict__ cfl_h) {
109 const int e = blockIdx.x;
110 const int iii = threadIdx.x;
111 const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
112 const unsigned int lane = threadIdx.x % warpSize;
113 const unsigned int wid = threadIdx.x / warpSize;
115 __shared__ T
shu[LX * LX * LX];
116 __shared__ T
shv[LX * LX * LX];
117 __shared__ T
shw[LX * LX * LX];
119 __shared__ T shdr_inv[LX];
120 __shared__ T shds_inv[LX];
121 __shared__ T shdt_inv[LX];
123 __shared__ T shjacinv[LX * LX * LX];
125 __shared__ T shared[64];
128 shdr_inv[iii] = dr_inv[iii];
129 shds_inv[iii] = ds_inv[iii];
130 shdt_inv[iii] = dt_inv[iii];
134 while(
j < (LX * LX * LX)) {
135 shu[
j] =
u[
j +
e * LX * LX * LX];
136 shv[
j] =
v[
j +
e * LX * LX * LX];
137 shw[
j] =
w[
j +
e * LX * LX * LX];
139 shjacinv[
j] =
jacinv[
j +
e * LX * LX * LX];
147 for (
int n = 0; n < nchunks; n++) {
148 const int ijk = iii + n * CHUNKS;
149 const int jk = ijk / LX;
153 if (
i < LX &&
j < LX && k < LX) {
154 const T cflr = fabs(
dt * ( (
shu[ijk] *
drdx[ijk +
e * LX * LX * LX]
155 +
shv[ijk] *
drdy[ijk +
e * LX * LX * LX]
156 *
shw[ijk] *
drdz[ijk +
e * LX * LX * LX]
157 ) * shjacinv[ijk]) * shdr_inv[
i]);
158 const T cfls = fabs(
dt * ( (
shu[ijk] *
dsdx[ijk +
e * LX * LX * LX]
159 +
shv[ijk] *
dsdy[ijk +
e * LX * LX * LX]
160 +
shw[ijk] *
dsdz[ijk +
e * LX * LX * LX]
161 ) * shjacinv[ijk]) * shds_inv[
j]);
162 const T cflt = fabs(
dt * ( (
shu[ijk] *
dtdx[ijk +
e * LX * LX * LX]
163 +
shv[ijk] *
dtdy[ijk +
e * LX * LX * LX]
164 +
shw[ijk] *
dtdz[ijk +
e * LX * LX * LX]
165 ) * shjacinv[ijk]) * shdt_inv[k]);
167 cfl_tmp = fmax(cflr + cfls + cflt, cfl_tmp);
172 cfl_tmp = cfl_reduce_warp<T>(cfl_tmp);
174 shared[wid] = cfl_tmp;
177 cfl_tmp = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
179 cfl_tmp = cfl_reduce_warp<T>(cfl_tmp);
181 if (threadIdx.x == 0)
182 cfl_h[blockIdx.x] = cfl_tmp;
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__global__ void cfl_kernel(const T dt, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ drdx, const T *__restrict__ dsdx, const T *__restrict__ dtdx, const T *__restrict__ drdy, const T *__restrict__ dsdy, const T *__restrict__ dtdy, const T *__restrict__ drdz, const T *__restrict__ dsdz, const T *__restrict__ dtdz, const T *__restrict__ dr_inv, const T *__restrict__ ds_inv, const T *__restrict__ dt_inv, const T *__restrict__ jacinv, T *__restrict__ cfl_h)
__global__ void cfl_reduce_kernel(T *bufred, const int n)
__inline__ __device__ T cfl_reduce_warp(T val)
real(kind=rp) function, public cfl(dt, u, v, w, Xh, coef, nelv, gdim)