1 #ifndef __MATH_CDTP_KERNEL_H__
2 #define __MATH_CDTP_KERNEL_H__
41 template<
typename T, const
int LX, const
int CHUNKS >
43 const T * __restrict__
x,
44 const T * __restrict__
dr,
45 const T * __restrict__
ds,
46 const T * __restrict__
dt,
47 const T * __restrict__
dxt,
48 const T * __restrict__
dyt,
49 const T * __restrict__
dzt,
50 const T * __restrict__
w3) {
52 __shared__ T
shdxt[LX * LX];
53 __shared__ T
shdyt[LX * LX];
54 __shared__ T
shdzt[LX * LX];
56 __shared__ T
shtar[LX * LX * LX];
57 __shared__ T
shtas[LX * LX * LX];
58 __shared__ T shtat[LX * LX * LX];
60 const int e = blockIdx.x;
61 const int iii = threadIdx.x;
62 const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
64 if (iii < (LX * LX)) {
71 while(l < (LX * LX * LX)) {
72 T wx =
x[l +
e * LX * LX * LX] *
w3[l];
74 shtar[l] = wx*
dr[l +
e * LX * LX * LX];
75 shtas[l] = wx*
ds[l +
e * LX * LX * LX];
76 shtat[l] = wx*
dt[l +
e * LX * LX * LX];
82 for (
int n = 0; n < nchunks; n++) {
83 const int ijk = iii + n * CHUNKS;
84 const int jk = ijk / LX;
85 const int i = ijk - jk * LX;
86 const int k = jk / LX;
87 const int j = jk - k * LX;
88 if (
i < LX &&
j < LX && k < LX && ijk < LX*LX*LX) {
92 for (
int l = 0; l < LX; l++) {
95 ttmp +=
shdzt[k + l * LX] * shtat[
i +
j*LX + l*LX*LX];
97 dtx[ijk +
e * LX * LX * LX] = ( rtmp + stmp + ttmp );
103 template<
typename T, const
int LX >
105 cdtp_kernel_kstep(T * __restrict__ dtx,
106 const T * __restrict__
x,
107 const T * __restrict__
dr,
108 const T * __restrict__
ds,
109 const T * __restrict__
dt,
110 const T * __restrict__
dxt,
111 const T * __restrict__
dyt,
112 const T * __restrict__
dzt,
113 const T * __restrict__
w3) {
115 __shared__ T
shdxt[LX * LX];
126 const int e = blockIdx.x;
127 const int j = threadIdx.y;
128 const int i = threadIdx.x;
138 for (
int k = 0; k < LX; ++k) {
139 T wx =
x[
ij + k*LX*LX +
ele] *
w3[
ij + k*LX*LX];
149 for (
int k = 0; k < LX; ++k) {
150 const int ijk =
ij + k*LX*LX;
154 for (
int l = 0; l < LX; l++) {
162 for (
int l = 0; l < LX; l++) {
167 dtx[ijk +
ele] = ( rtmp + stmp + ttmp );
__shared__ T shdyt[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ dr
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__shared__ T shtas[LX *LX]
__global__ void __launch_bounds__(LX *LX, 3) cdtp_kernel_kstep(T *__restrict__ dtx
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__shared__ T shdzt[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
__shared__ T shtar[LX *LX]
__global__ void cdtp_kernel_1d(T *__restrict__ dtx, const T *__restrict__ x, const T *__restrict__ dr, const T *__restrict__ ds, const T *__restrict__ dt, const T *__restrict__ dxt, const T *__restrict__ dyt, const T *__restrict__ dzt, const T *__restrict__ w3)
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt