1 #ifndef __MATH_OPGRAD_KERNEL_H__
2 #define __MATH_OPGRAD_KERNEL_H__
41 template<
typename T, const
int LX, const
int CHUNKS >
45 const T * __restrict__
u,
46 const T * __restrict__
dx,
47 const T * __restrict__
dy,
48 const T * __restrict__
dz,
49 const T * __restrict__
drdx,
50 const T * __restrict__
dsdx,
51 const T * __restrict__
dtdx,
52 const T * __restrict__
drdy,
53 const T * __restrict__
dsdy,
54 const T * __restrict__
dtdy,
55 const T * __restrict__
drdz,
56 const T * __restrict__
dsdz,
57 const T * __restrict__
dtdz,
58 const T * __restrict__
w3) {
60 __shared__ T
shu[LX * LX * LX];
62 __shared__ T
shdx[LX * LX];
63 __shared__ T
shdy[LX * LX];
64 __shared__ T
shdz[LX * LX];
69 const int e = blockIdx.x;
70 const int iii = threadIdx.x;
71 const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
73 if (iii < (LX * LX)) {
80 while(
j < (LX * LX * LX)) {
81 shu[
j] =
u[
j +
e * LX * LX * LX];
87 for (
int n = 0; n < nchunks; n++) {
88 const int ijk = iii + n * CHUNKS;
89 const int jk = ijk / LX;
93 if (
i < LX &&
j < LX && k < LX ) {
97 for (
int l = 0; l < LX; l++) {
98 rtmp +=
shdx[
i + l * LX] *
shu[l +
j * LX + k * LX * LX];
99 stmp +=
shdy[
j + l * LX] *
shu[
i + l * LX + k * LX * LX];
100 ttmp +=
shdz[k + l * LX] *
shu[
i +
j * LX + l * LX * LX];
103 ux[ijk +
e * LX * LX * LX] =
w3[ijk]
104 * (
drdx[ijk +
e * LX * LX * LX] * rtmp
105 +
dsdx[ijk +
e * LX * LX * LX] * stmp
106 +
dtdx[ijk +
e * LX * LX * LX] * ttmp);
108 uy[ijk +
e * LX * LX * LX] =
w3[ijk]
109 * (
drdy[ijk +
e * LX * LX * LX] * rtmp
110 +
dsdy[ijk +
e * LX * LX * LX] * stmp
111 +
dtdy[ijk +
e * LX * LX * LX] * ttmp);
113 uz[ijk +
e * LX * LX * LX] =
w3[ijk]
114 * (
drdz[ijk +
e * LX * LX * LX] * rtmp
115 +
dsdz[ijk +
e * LX * LX * LX] * stmp
116 +
dtdz[ijk +
e * LX * LX * LX] * ttmp);
123 template<
typename T, const
int LX >
125 opgrad_kernel_kstep(T * __restrict__ ux,
128 const T * __restrict__
u,
129 const T * __restrict__
dx,
130 const T * __restrict__
dy,
131 const T * __restrict__
dz,
141 const T * __restrict__
w3) {
143 __shared__ T
shu[LX * LX];
149 const int e = blockIdx.x;
150 const int j = threadIdx.y;
151 const int i = threadIdx.x;
162 for (
int k = 0; k < LX; ++k) {
169 for (
int k = 0; k < LX; ++k) {
170 const int ijk =
ij + k*LX*LX;
171 const T W3 =
w3[ijk];
174 for (
int l = 0; l < LX; l++) {
175 ttmp +=
shdz[k+l*LX] *
ru[l];
182 for (
int l = 0; l < LX; l++) {
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void __launch_bounds__(LX *LX, 3) opgrad_kernel_kstep(T *__restrict__ ux
__global__ void T *__restrict__ uy
__global__ void opgrad_kernel_1d(T *__restrict__ ux, T *__restrict__ uy, T *__restrict__ uz, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ drdx, const T *__restrict__ dsdx, const T *__restrict__ dtdx, const T *__restrict__ drdy, const T *__restrict__ dsdy, const T *__restrict__ dtdy, const T *__restrict__ drdz, const T *__restrict__ dsdz, const T *__restrict__ dtdz, const T *__restrict__ w3)
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ T *__restrict__ uz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__shared__ T shdz[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__shared__ T shdy[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__shared__ T shdx[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3