1 #ifndef __MATH_CDTP_KERNEL_H__
2 #define __MATH_CDTP_KERNEL_H__
/**
 * Device kernel fragment, templated on the scalar type T, the per-direction
 * point count LX and a stride CHUNKS used to walk the LX^3 points of one
 * element.
 *
 * NOTE(review): this excerpt is incomplete. The line carrying the
 * `__global__` qualifier, the kernel name and the output parameter `dtx`
 * is missing here; a full signature listed later in the file suggests the
 * name is cdtp_kernel_1d and that dtx is `T * __restrict__` -- confirm
 * against the original header. The leading integers on some lines look like
 * original-file line numbers left behind by an extraction step.
 *
 * Visible inputs (all read-only, non-aliasing pointers):
 *   x, B, jac      combined point-wise as x * B / jac
 *   dr, ds, dt     point-wise multipliers of that quotient
 *   dxt, dyt, dzt  not read in any visible line; presumably the source of
 *                  the shdxt/shdyt/shdzt shared copies -- TODO confirm
 * Visible output:
 *   dtx            receives rtmp + stmp + ttmp per point of element e
 */
41 template<
typename T, const
int LX, const
int CHUNKS >
43 const T * __restrict__
x,
44 const T * __restrict__
dr,
45 const T * __restrict__
ds,
46 const T * __restrict__
dt,
47 const T * __restrict__
dxt,
48 const T * __restrict__
dyt,
49 const T * __restrict__
dzt,
50 const T * __restrict__
B,
51 const T * __restrict__
jac) {
// Block-shared LX x LX tables; only shdzt is read in the visible code.
53 __shared__ T
shdxt[LX * LX];
54 __shared__ T
shdyt[LX * LX];
55 __shared__ T
shdzt[LX * LX];
// Block-shared LX^3 work arrays holding the weighted field per direction.
57 __shared__ T
shtar[LX * LX * LX];
58 __shared__ T
shtas[LX * LX * LX];
59 __shared__ T shtat[LX * LX * LX];
// One block per element (e); iii is the 1D thread index within the block.
61 const int e = blockIdx.x;
62 const int iii = threadIdx.x;
// Ceiling of LX^3 / CHUNKS: number of passes needed to cover every point.
63 const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
// Only the first LX*LX threads enter this branch; its body is not part of
// this excerpt (presumably it fills shdxt/shdyt/shdzt -- TODO confirm).
65 if (iii < (LX * LX)) {
// Walk the element's points. The declaration, initial value and increment
// of `l` are not visible in this excerpt.
72 while(l < (LX * LX * LX)) {
// wx = x * B / jac at point l of element e (element offset e * LX^3).
74 T wx = (
x[l +
e * LX * LX * LX] *
B[l +
e * LX * LX * LX]) /
75 jac[l +
e * LX * LX * LX];
// Stage wx scaled by dr, ds and dt into the shared work arrays.
77 shtar[l] = wx*
dr[l +
e * LX * LX * LX];
78 shtas[l] = wx*
ds[l +
e * LX * LX * LX];
79 shtat[l] = wx*
dt[l +
e * LX * LX * LX];
// NOTE(review): the shared arrays written above are read below by other
// threads; a block-wide barrier is required in between but none is visible
// in this excerpt -- verify it exists in the original.
// Each thread handles points iii, iii + CHUNKS, iii + 2*CHUNKS, ...
85 for (
int n = 0; n < nchunks; n++) {
86 const int ijk = iii + n * CHUNKS;
// Split the linear index into (i, j, k) with i varying fastest.
87 const int jk = ijk / LX;
88 const int i = ijk - jk * LX;
89 const int k = jk / LX;
90 const int j = jk - k * LX;
// Skip indices past the end of the element in the final pass.
91 if (
i < LX &&
j < LX && k < LX && ijk < LX*LX*LX) {
// Contraction over l. Only the ttmp update is visible; the declarations of
// rtmp/stmp/ttmp and the rtmp/stmp updates are missing from this excerpt.
95 for (
int l = 0; l < LX; l++) {
98 ttmp +=
shdzt[k + l * LX] * shtat[
i +
j*LX + l*LX*LX];
// Store the sum of the three directional contributions for this point.
100 dtx[ijk +
e * LX * LX * LX] = ( rtmp + stmp + ttmp );
/**
 * cdtp_kernel_kstep: device kernel fragment templated on the scalar type T
 * and the per-direction point count LX. Threads are indexed in 2D (i, j)
 * and each visible outer loop steps k from 0 to LX-1.
 *
 * NOTE(review): this excerpt is incomplete. The qualifier line before the
 * kernel name, the definitions of `ij` and `ele`, the divisor of `wx`, the
 * bodies of both inner l-loops, any barriers and all closing braces are
 * missing. A declaration listed later in the file shows
 * `__launch_bounds__(LX * LX, 3)` on this kernel, which suggests an
 * LX x LX thread block -- confirm against the original header and launcher.
 *
 * Visible inputs (all read-only, non-aliasing pointers):
 *   x, B           multiplied point-wise to form the numerator of wx
 *   dr, ds, dt, dxt, dyt, dzt, jac
 *                  not read in any visible line of this excerpt
 * Visible output:
 *   dtx            receives rtmp + stmp + ttmp at index ijk + ele
 */
106 template<
typename T, const
int LX >
108 cdtp_kernel_kstep(T * __restrict__ dtx,
109 const T * __restrict__
x,
110 const T * __restrict__
dr,
111 const T * __restrict__
ds,
112 const T * __restrict__
dt,
113 const T * __restrict__
dxt,
114 const T * __restrict__
dyt,
115 const T * __restrict__
dzt,
116 const T * __restrict__
B,
117 const T * __restrict__
jac) {
// Block-shared LX x LX table; the remaining shared declarations of this
// kernel are not part of this excerpt.
119 __shared__ T
shdxt[LX * LX];
// One block per element (e); (i, j) come from the 2D thread index.
130 const int e = blockIdx.x;
131 const int j = threadIdx.y;
132 const int i = threadIdx.x;
// First pass over k: form wx from x * B at index ij + k*LX*LX + ele.
// `ij` and `ele` are defined on lines missing from this excerpt
// (presumably ij = i + j*LX and ele = e*LX^3 -- TODO confirm), and the
// divisor of wx is cut off as well.
142 for (
int k = 0; k < LX; ++k) {
143 T wx = (
x[
ij + k*LX*LX +
ele] *
B[
ij + k*LX*LX +
ele]) /
// Second pass over k: one output point per (thread, k) pair.
154 for (
int k = 0; k < LX; ++k) {
155 const int ijk =
ij + k*LX*LX;
// Two loops over l follow; their bodies are not visible, so which of
// rtmp/stmp/ttmp each one accumulates cannot be stated from here.
159 for (
int l = 0; l < LX; l++) {
167 for (
int l = 0; l < LX; l++) {
// Store the sum of the three directional contributions for this point.
172 dtx[ijk +
ele] = ( rtmp + stmp + ttmp );
__shared__ T shdyt[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ dr
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__shared__ T shtas[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ B
__global__ void __launch_bounds__(LX *LX, 3) cdtp_kernel_kstep(T *__restrict__ dtx
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jac
__shared__ T shdzt[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
__shared__ T shtar[LX *LX]
__global__ void cdtp_kernel_1d(T *__restrict__ dtx, const T *__restrict__ x, const T *__restrict__ dr, const T *__restrict__ ds, const T *__restrict__ dt, const T *__restrict__ dxt, const T *__restrict__ dyt, const T *__restrict__ dzt, const T *__restrict__ B, const T *__restrict__ jac)
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt