1 #ifndef __MATH_CFL_KERNEL_H__
2 #define __MATH_CFL_KERNEL_H__
43 val = fmax(val, __shfl_down_sync(0xffffffff, val, 16));
44 val = fmax(val, __shfl_down_sync(0xffffffff, val, 8));
45 val = fmax(val, __shfl_down_sync(0xffffffff, val, 4));
46 val = fmax(val, __shfl_down_sync(0xffffffff, val, 2));
47 val = fmax(val, __shfl_down_sync(0xffffffff, val, 1));
54 template<
typename T >
58 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
59 const int str = blockDim.x * gridDim.x;
60 for (
int i = idx;
i<n ;
i += str)
65 __shared__ T shared[32];
66 unsigned int lane = threadIdx.x % warpSize;
67 unsigned int wid = threadIdx.x / warpSize;
69 cfl = cfl_reduce_warp<T>(
cfl);
74 cfl = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
76 cfl = cfl_reduce_warp<T>(
cfl);
86 template<
typename T, const
int LX, const
int CHUNKS >
88 const T * __restrict__
u,
89 const T * __restrict__
v,
90 const T * __restrict__
w,
91 const T * __restrict__
drdx,
92 const T * __restrict__
dsdx,
93 const T * __restrict__
dtdx,
94 const T * __restrict__
drdy,
95 const T * __restrict__
dsdy,
96 const T * __restrict__
dtdy,
97 const T * __restrict__
drdz,
98 const T * __restrict__
dsdz,
99 const T * __restrict__
dtdz,
100 const T * __restrict__ dr_inv,
101 const T * __restrict__ ds_inv,
102 const T * __restrict__ dt_inv,
103 const T * __restrict__
jacinv,
104 T * __restrict__ cfl_h) {
108 const int e = blockIdx.x;
109 const int iii = threadIdx.x;
110 const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
111 const unsigned int lane = threadIdx.x % warpSize;
112 const unsigned int wid = threadIdx.x / warpSize;
114 __shared__ T
shu[LX * LX * LX];
115 __shared__ T
shv[LX * LX * LX];
116 __shared__ T
shw[LX * LX * LX];
118 __shared__ T shdr_inv[LX];
119 __shared__ T shds_inv[LX];
120 __shared__ T shdt_inv[LX];
122 __shared__ T shjacinv[LX * LX * LX];
124 __shared__ T shared[32];
127 shdr_inv[iii] = dr_inv[iii];
128 shds_inv[iii] = ds_inv[iii];
129 shdt_inv[iii] = dt_inv[iii];
133 while(
j < (LX * LX * LX)) {
134 shu[
j] =
u[
j +
e * LX * LX * LX];
135 shv[
j] =
v[
j +
e * LX * LX * LX];
136 shw[
j] =
w[
j +
e * LX * LX * LX];
138 shjacinv[
j] =
jacinv[
j +
e * LX * LX * LX];
146 for (
int n = 0; n < nchunks; n++) {
147 const int ijk = iii + n * CHUNKS;
148 const int jk = ijk / LX;
152 if (
i < LX &&
j < LX && k < LX) {
153 const T cflr = fabs(
dt * ( (
shu[ijk] *
drdx[ijk +
e * LX * LX * LX]
154 +
shv[ijk] *
drdy[ijk +
e * LX * LX * LX]
155 *
shw[ijk] *
drdz[ijk +
e * LX * LX * LX]
156 ) * shjacinv[ijk]) * shdr_inv[
i]);
157 const T cfls = fabs(
dt * ( (
shu[ijk] *
dsdx[ijk +
e * LX * LX * LX]
158 +
shv[ijk] *
dsdy[ijk +
e * LX * LX * LX]
159 +
shw[ijk] *
dsdz[ijk +
e * LX * LX * LX]
160 ) * shjacinv[ijk]) * shds_inv[
j]);
161 const T cflt = fabs(
dt * ( (
shu[ijk] *
dtdx[ijk +
e * LX * LX * LX]
162 +
shv[ijk] *
dtdy[ijk +
e * LX * LX * LX]
163 +
shw[ijk] *
dtdz[ijk +
e * LX * LX * LX]
164 ) * shjacinv[ijk]) * shdt_inv[k]);
166 cfl_tmp = fmax(cflr + cfls + cflt, cfl_tmp);
171 cfl_tmp = cfl_reduce_warp<T>(cfl_tmp);
173 shared[wid] = cfl_tmp;
176 cfl_tmp = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
178 cfl_tmp = cfl_reduce_warp<T>(cfl_tmp);
180 if (threadIdx.x == 0)
181 cfl_h[blockIdx.x] = cfl_tmp;
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__global__ void cfl_kernel(const T dt, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ drdx, const T *__restrict__ dsdx, const T *__restrict__ dtdx, const T *__restrict__ drdy, const T *__restrict__ dsdy, const T *__restrict__ dtdy, const T *__restrict__ drdz, const T *__restrict__ dsdz, const T *__restrict__ dtdz, const T *__restrict__ dr_inv, const T *__restrict__ ds_inv, const T *__restrict__ dt_inv, const T *__restrict__ jacinv, T *__restrict__ cfl_h)
__global__ void cfl_reduce_kernel(T *bufred, const int n)
__inline__ __device__ T cfl_reduce_warp(T val)
real(kind=rp) function, public cfl(dt, u, v, w, Xh, coef, nelv, gdim)