1 #ifndef __MATH_CONV1_KERNEL_H__
2 #define __MATH_CONV1_KERNEL_H__
41 template<
typename T, const
int LX, const
int CHUNKS >
43 const T * __restrict__
u,
44 const T * __restrict__
vx,
45 const T * __restrict__
vy,
46 const T * __restrict__
vz,
47 const T * __restrict__
dx,
48 const T * __restrict__
dy,
49 const T * __restrict__
dz,
50 const T * __restrict__
drdx,
51 const T * __restrict__
dsdx,
52 const T * __restrict__
dtdx,
53 const T * __restrict__
drdy,
54 const T * __restrict__
dsdy,
55 const T * __restrict__
dtdy,
56 const T * __restrict__
drdz,
57 const T * __restrict__
dsdz,
58 const T * __restrict__
dtdz,
59 const T * __restrict__
jacinv) {
61 __shared__ T
shu[LX * LX * LX];
63 __shared__ T shvx[LX * LX * LX];
64 __shared__ T shvy[LX * LX * LX];
65 __shared__ T shvz[LX * LX * LX];
67 __shared__ T
shdx[LX * LX];
68 __shared__ T
shdy[LX * LX];
69 __shared__ T
shdz[LX * LX];
71 __shared__ T shjacinv[LX * LX * LX];
73 const int e = blockIdx.x;
74 const int iii = threadIdx.x;
75 const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
76 const int ele =
e*LX*LX*LX;
78 if (iii < (LX * LX)) {
85 while(l < (LX * LX * LX)) {
88 shvx[l] =
vx[l +
ele];
89 shvy[l] =
vy[l +
ele];
90 shvz[l] =
vz[l +
ele];
99 for (
int n = 0; n < nchunks; n++) {
100 const int ijk = iii + n * CHUNKS;
101 const int jk = ijk / LX;
102 const int i = ijk - jk * LX;
103 const int k = jk / LX;
104 const int j = jk - k * LX;
105 if (
i < LX &&
j < LX && k < LX) {
109 for (
int l = 0; l < LX; l++) {
110 rtmp +=
shdx[
i + l * LX] *
shu[l +
j * LX + k * LX * LX];
111 stmp +=
shdy[
j + l * LX] *
shu[
i + l * LX + k * LX * LX];
112 ttmp +=
shdz[k + l * LX] *
shu[
i +
j * LX + l * LX * LX];
115 du[ijk +
e * LX * LX * LX] = shjacinv[ijk] *
116 (shvx[ijk] * (
drdx[ijk +
ele] * rtmp
119 + shvy[ijk] * (
drdy[ijk +
ele] * rtmp
122 + shvz[ijk] * (
drdz[ijk +
ele] * rtmp
129 template<
typename T, const
int LX >
131 conv1_kernel_kstep(T * __restrict__ du,
132 const T * __restrict__
u,
133 const T * __restrict__
vx,
134 const T * __restrict__
vy,
135 const T * __restrict__
vz,
136 const T * __restrict__
dx,
137 const T * __restrict__
dy,
138 const T * __restrict__
dz,
150 __shared__ T
shu[LX * LX];
156 const int e = blockIdx.x;
157 const int j = threadIdx.y;
158 const int i = threadIdx.x;
173 for (
int k = 0; k < LX; ++k) {
184 for (
int k = 0; k < LX; ++k) {
185 const int ijk =
ij + k*LX*LX;
188 for (
int l = 0; l < LX; l++) {
189 ttmp +=
shdz[k+l*LX] *
ru[l];
196 for (
int l = 0; l < LX; l++) {
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__shared__ T shdz[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__shared__ T shdy[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ vz
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void const T *__restrict__ const T *__restrict__ vx
__global__ void conv1_kernel_1d(T *__restrict__ du, const T *__restrict__ u, const T *__restrict__ vx, const T *__restrict__ vy, const T *__restrict__ vz, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ drdx, const T *__restrict__ dsdx, const T *__restrict__ dtdx, const T *__restrict__ drdy, const T *__restrict__ dsdy, const T *__restrict__ dtdy, const T *__restrict__ drdz, const T *__restrict__ dsdz, const T *__restrict__ dtdz, const T *__restrict__ jacinv)
__shared__ T shdx[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__global__ void __launch_bounds__(LX *LX, 3) conv1_kernel_kstep(T *__restrict__ du
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ vy
__global__ void const T *__restrict__ u
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz