1 #ifndef __MATH_DUDXYZ_KERNEL_H__
2 #define __MATH_DUDXYZ_KERNEL_H__
/**
 * Kernel fragment operating on one element per thread block, with T the
 * element type and LX, CHUNKS compile-time integers.
 *
 * NOTE(review): the line carrying the kernel name, qualifiers and first
 * parameter is missing from this excerpt; presumably this is
 * dudxyz_kernel_1d with `du` as the output pointer - confirm against the
 * full file.
 *
 * Visible behaviour:
 *  - u, dr, ds, dt and jacinv of element e = blockIdx.x are copied into
 *    shared arrays of LX*LX*LX entries each.
 *  - shdx, shdy and shdz are shared LX*LX arrays that are read in the
 *    contraction loop below (their fill is not visible here).
 *  - Each thread visits the points ijk = threadIdx.x + n * CHUNKS and
 *    writes one entry of du per visited point.
 */
41 template<
typename T, const
int LX, const
int CHUNKS >
43 const T * __restrict__
u,
44 const T * __restrict__
dr,
45 const T * __restrict__
ds,
46 const T * __restrict__
dt,
47 const T * __restrict__
dx,
48 const T * __restrict__
dy,
49 const T * __restrict__
dz,
50 const T * __restrict__
jacinv) {
// Per-element staging buffers, one entry per point of the LX^3 element.
52 __shared__ T
shu[LX * LX * LX];
53 __shared__ T shdr[LX * LX * LX];
54 __shared__ T shds[LX * LX * LX];
55 __shared__ T shdt[LX * LX * LX];
// LX x LX buffers indexed as [row + l * LX] in the contraction loop below.
57 __shared__ T
shdx[LX * LX];
58 __shared__ T
shdy[LX * LX];
59 __shared__ T
shdz[LX * LX];
61 __shared__ T shjacinv[LX * LX * LX];
// One block per element; the block is indexed along x only.
63 const int e = blockIdx.x;
64 const int iii = threadIdx.x;
// Ceiling division: number of CHUNKS-sized strides needed to cover LX^3 points.
// NOTE(review): presumably CHUNKS equals blockDim.x - confirm against the launcher.
65 const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
// Only the first LX*LX threads enter this branch.
// NOTE(review): the branch body is missing from this excerpt; presumably it
// fills shdx/shdy/shdz from dx/dy/dz - confirm.
67 if (iii < (LX * LX)) {
// Copy this element's data into shared memory; e * LX^3 is the element offset.
// NOTE(review): the declaration and increment of `l` are not visible here.
74 while(l < (LX * LX * LX)) {
75 shu[l] =
u[l +
e * LX * LX * LX];
76 shdr[l] =
dr[l +
e * LX * LX * LX];
77 shds[l] =
ds[l +
e * LX * LX * LX];
78 shdt[l] =
dt[l +
e * LX * LX * LX];
79 shjacinv[l] =
jacinv[l +
e * LX * LX * LX];
// NOTE(review): no barrier between the shared-memory writes above and the
// reads below is visible in this excerpt - confirm one exists in the full file.
85 for (
int n = 0; n < nchunks; n++) {
// Linear point index handled by this thread in chunk n.
86 const int ijk = iii + n * CHUNKS;
// Split ijk into (i, j, k) such that ijk = i + j * LX + k * LX * LX.
87 const int jk = ijk / LX;
88 const int i = ijk - jk * LX;
89 const int k = jk / LX;
90 const int j = jk - k * LX;
// i and j are remainders modulo LX, so this check effectively rejects
// indices past the end of the element (k >= LX) in the last chunk.
91 if (
i < LX &&
j < LX && k < LX) {
// NOTE(review): the declarations of rtmp, stmp and ttmp are not visible here.
// Each accumulator contracts one LX x LX matrix with shu along one index:
// rtmp along i (shdx), stmp along j (shdy), ttmp along k (shdz).
95 for (
int l = 0; l < LX; l++) {
96 rtmp +=
shdx[
i + l * LX] *
shu[l +
j * LX + k * LX * LX];
97 stmp +=
shdy[
j + l * LX] *
shu[
i + l * LX + k * LX * LX];
98 ttmp +=
shdz[k + l * LX] *
shu[
i +
j * LX + l * LX * LX];
// Combine the accumulators with the per-point factors and store to du.
// NOTE(review): the middle term and the tail of this expression are missing
// from the excerpt; presumably (stmp * shds[ijk]) and a shjacinv factor - confirm.
100 du[ijk +
e * LX * LX * LX] = ((rtmp * shdr[ijk])
102 + (ttmp * shdt[ijk]))
/**
 * dudxyz_kernel_kstep fragment, with T the element type and LX a
 * compile-time integer.
 *
 * One thread block handles element e = blockIdx.x. The block is indexed in
 * two dimensions, with i = threadIdx.x and j = threadIdx.y, and the visible
 * loops step over k from 0 to LX - 1.
 *
 * NOTE(review): the return type and qualifiers, the final parameter, most
 * shared and local declarations, the loop bodies and the closing braces are
 * missing from this excerpt; the comments below cover only what is visible.
 */
109 template<
typename T, const
int LX >
111 dudxyz_kernel_kstep(T * __restrict__ du,
112 const T * __restrict__
u,
113 const T * __restrict__
dr,
114 const T * __restrict__
ds,
115 const T * __restrict__
dt,
116 const T * __restrict__
dx,
117 const T * __restrict__
dy,
118 const T * __restrict__
dz,
// Shared buffer with LX*LX entries, i.e. a single (i, j) plane.
121 __shared__ T
shu[LX * LX];
127 const int e = blockIdx.x;
128 const int j = threadIdx.y;
129 const int i = threadIdx.x;
// First pass over k.
// NOTE(review): the body is not visible; presumably it fills the
// per-thread array `ru` read below - confirm.
144 for (
int k = 0; k < LX; ++k) {
// Second pass over k, one plane of the element per iteration.
155 for (
int k = 0; k < LX; ++k) {
// Linear index of point (i, j, k).
// NOTE(review): `ij` is not declared in this excerpt; presumably i + j * LX.
156 const int ijk =
ij + k*LX*LX;
// Accumulate ttmp as the sum over l of shdz[k + l * LX] * ru[l].
// NOTE(review): the declarations of ttmp, shdz and ru are not visible here.
159 for (
int l = 0; l < LX; l++) {
160 ttmp +=
shdz[k+l*LX] *
ru[l];
// Another contraction loop over l; its body is missing from this excerpt.
167 for (
int l = 0; l < LX; l++) {
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void const T *__restrict__ const T *__restrict__ dr
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__shared__ T shdz[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__shared__ T shdy[LX *LX]
__global__ void __launch_bounds__(LX *LX, 3) dudxyz_kernel_kstep(T *__restrict__ du
__global__ void dudxyz_kernel_1d(T *__restrict__ du, const T *__restrict__ u, const T *__restrict__ dr, const T *__restrict__ ds, const T *__restrict__ dt, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ jacinv)
__shared__ T shdx[LX *LX]
__global__ void const T *__restrict__ u
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv