1 #ifndef __SEM_COEF_KERNEL_H__
2 #define __SEM_COEF_KERNEL_H__
/**
 * Fragment of a device kernel templated on the scalar type T and the
 * compile-time sizes LX and CHUNKS.
 * NOTE(review): the line carrying the kernel name, the leading parameters,
 * several body statements and the closing braces are not present in this
 * view; presumably this is coef_generate_geo_kernel -- confirm.
 *
 * What the visible code does: for the block index e, every entry
 * ijk of the six arrays G11, G12, G13, G22, G23, G33 (offset by
 * e * LX * LX * LX) is multiplied in place by shw3[ijk].
 */
40 template<
typename T, const
int LX, const
int CHUNKS >
// Read-only inputs visible in this view (all const T * __restrict__).
47 const T * __restrict__
drdx,
48 const T * __restrict__
drdy,
49 const T * __restrict__
drdz,
50 const T * __restrict__
dsdx,
51 const T * __restrict__
dsdy,
52 const T * __restrict__
dsdz,
53 const T * __restrict__
dtdx,
54 const T * __restrict__
dtdy,
55 const T * __restrict__
dtdz,
56 const T * __restrict__
jacinv,
57 const T * __restrict__
w3,
// e: block index, used below as the multiplier of the LX*LX*LX offset.
62 const int e = blockIdx.x;
// iii: thread index within the block.
63 const int iii = threadIdx.x;
// Ceiling of LX^3 / CHUNKS: trip count of the chunk loop further down.
64 const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
// Shared array with LX^3 entries of T.
// NOTE(review): the statement that fills shw3 is not visible here
// (presumably copied from w3 -- confirm).
66 __shared__ T shw3[LX * LX * LX];
// Loop over j below LX^3; the declaration and the increment of j are not
// visible in this view.
69 while(
j < (LX * LX * LX)) {
// Global position of local entry j for block e.
70 const int i =
j +
e * LX * LX * LX;
// Each pass handles the entry ijk = iii + n * CHUNKS.
86 for (
int n = 0; n < nchunks; n++) {
87 const int ijk = iii + n * CHUNKS;
88 const int jk = ijk / LX;
// Bounds guard on i, j, k; the lines deriving i, j and k from ijk / jk are
// not visible in this view.
92 if (
i < LX &&
j < LX && k < LX) {
// In-place scaling of the six G arrays by shw3[ijk].
93 G11[ijk +
e * LX * LX * LX] *= shw3[ijk];
94 G12[ijk +
e * LX * LX * LX] *= shw3[ijk];
95 G13[ijk +
e * LX * LX * LX] *= shw3[ijk];
96 G22[ijk +
e * LX * LX * LX] *= shw3[ijk];
97 G23[ijk +
e * LX * LX * LX] *= shw3[ijk];
98 G33[ijk +
e * LX * LX * LX] *= shw3[ijk];
/**
 * Fragment of a device kernel templated on the scalar type T and the
 * compile-time sizes LX and CHUNKS.
 * NOTE(review): the line carrying the kernel name and its first parameter,
 * several body statements and the closing braces are not present in this
 * view; presumably this is coef_generate_dxyz_kernel -- confirm.
 *
 * What the visible code does, once each for x, y and z:
 *   1. copies the LX^3 entries of the input belonging to block index e
 *      into the shared array shu;
 *   2. for every entry (i, j, k) accumulates three sums over l
 *        rtmp = sum_l shdx[i + l*LX] * shu[l + j*LX + k*LX*LX]
 *        stmp = sum_l shdy[j + l*LX] * shu[i + l*LX + k*LX*LX]
 *        ttmp = sum_l shdz[k + l*LX] * shu[i + j*LX + l*LX*LX]
 *   3. stores them to the matching d?dr / d?ds / d?dt output at
 *      ijk + e * LX * LX * LX.
 */
106 template<
typename T, const
int LX, const
int CHUNKS >
// Output arrays (written below).
108 T * __restrict__ dydr,
109 T * __restrict__ dzdr,
110 T * __restrict__ dxds,
111 T * __restrict__ dyds,
112 T * __restrict__ dzds,
113 T * __restrict__ dxdt,
114 T * __restrict__ dydt,
115 T * __restrict__ dzdt,
// Read-only inputs. dx, dy, dz are not read on any line visible here
// (presumably the source of shdx / shdy / shdz -- confirm).
116 const T * __restrict__
dx,
117 const T * __restrict__
dy,
118 const T * __restrict__
dz,
119 const T * __restrict__
x,
120 const T * __restrict__ y,
121 const T * __restrict__ z) {
// e: block index, used below as the multiplier of the LX*LX*LX offset.
125 const int e = blockIdx.x;
// iii: thread index within the block.
126 const int iii = threadIdx.x;
// Ceiling of LX^3 / CHUNKS: trip count of each chunk loop below.
127 const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
// Three LX*LX shared arrays used as the left factors of the sums below.
129 __shared__ T
shdx[LX * LX];
130 __shared__ T
shdy[LX * LX];
131 __shared__ T
shdz[LX * LX];
// LX^3 shared scratch, refilled from x, then y, then z.
133 __shared__ T
shu[LX * LX * LX];
// Only the first LX*LX threads enter this branch; its body is not visible
// in this view.
135 if (iii < (LX * LX)) {
// ---- x pass: stage block e's slice of x in shu.
// The declaration and increment of j are not visible in this view.
142 while(
j < (LX * LX * LX)) {
143 shu[
j] =
x[
j +
e * LX * LX * LX];
// NOTE(review): no __syncthreads() is visible between filling shu and
// reading it; confirm the barrier exists on the lines missing from this view.
149 for (
int n = 0; n < nchunks; n++) {
150 const int ijk = iii + n * CHUNKS;
151 const int jk = ijk / LX;
// Bounds guard on i, j, k; their derivation from ijk / jk is not visible.
155 if (
i < LX &&
j < LX && k < LX) {
// Accumulate the three sums over l (rtmp/stmp/ttmp declarations not visible).
159 for (
int l = 0; l < LX; l++) {
160 rtmp +=
shdx[
i + l * LX] *
shu[l +
j * LX + k * LX * LX];
161 stmp +=
shdy[
j + l * LX] *
shu[
i + l * LX + k * LX * LX];
162 ttmp +=
shdz[k + l * LX] *
shu[
i +
j * LX + l * LX * LX];
// Store the three sums for the x input.
164 dxdr[ijk +
e * LX * LX * LX] = rtmp;
165 dxds[ijk +
e * LX * LX * LX] = stmp;
166 dxdt[ijk +
e * LX * LX * LX] = ttmp;
// ---- y pass: same sequence with shu refilled from y.
173 while(
j < (LX * LX * LX)) {
174 shu[
j] = y[
j +
e * LX * LX * LX];
180 for (
int n = 0; n < nchunks; n++) {
181 const int ijk = iii + n * CHUNKS;
182 const int jk = ijk / LX;
186 if (
i < LX &&
j < LX && k < LX) {
190 for (
int l = 0; l < LX; l++) {
191 rtmp +=
shdx[
i + l * LX] *
shu[l +
j * LX + k * LX * LX];
192 stmp +=
shdy[
j + l * LX] *
shu[
i + l * LX + k * LX * LX];
193 ttmp +=
shdz[k + l * LX] *
shu[
i +
j * LX + l * LX * LX];
// Store the three sums for the y input.
195 dydr[ijk +
e * LX * LX * LX] = rtmp;
196 dyds[ijk +
e * LX * LX * LX] = stmp;
197 dydt[ijk +
e * LX * LX * LX] = ttmp;
// ---- z pass: same sequence with shu refilled from z.
204 while(
j < (LX * LX * LX)) {
205 shu[
j] = z[
j +
e * LX * LX * LX];
211 for (
int n = 0; n < nchunks; n++) {
212 const int ijk = iii + n * CHUNKS;
213 const int jk = ijk / LX;
217 if (
i < LX &&
j < LX && k < LX) {
221 for (
int l = 0; l < LX; l++) {
222 rtmp +=
shdx[
i + l * LX] *
shu[l +
j * LX + k * LX * LX];
223 stmp +=
shdy[
j + l * LX] *
shu[
i + l * LX + k * LX * LX];
224 ttmp +=
shdz[k + l * LX] *
shu[
i +
j * LX + l * LX * LX];
// Store the three sums for the z input.
226 dzdr[ijk +
e * LX * LX * LX] = rtmp;
227 dzds[ijk +
e * LX * LX * LX] = stmp;
228 dzdt[ijk +
e * LX * LX * LX] = ttmp;
/**
 * Fragment of a device kernel templated on the scalar type T.
 * NOTE(review): the line carrying the kernel name and its leading
 * parameters, the trailing parameter(s), original lines 270-271 and the
 * closing braces are not present in this view; presumably this is
 * coef_generate_drst_kernel -- confirm.
 *
 * What the visible code does for each index i below n:
 *   - jac[i] is the six-term expansion of the determinant of the 3x3
 *     matrix with rows (dxdr, dxds, dxdt), (dydr, dyds, dydt),
 *     (dzdr, dzds, dzdt) taken at i;
 *   - drdx .. dtdz each receive a difference of two products of those
 *     entries. No division or scaling by jac is applied on the visible lines.
 */
236 template<
typename T >
// Output arrays (written below).
239 T * __restrict__
drdx,
240 T * __restrict__
drdy,
241 T * __restrict__
drdz,
242 T * __restrict__
dsdx,
243 T * __restrict__
dsdy,
244 T * __restrict__
dsdz,
245 T * __restrict__
dtdx,
246 T * __restrict__
dtdy,
247 T * __restrict__
dtdz,
// Read-only inputs.
248 const T * __restrict__ dxdr,
249 const T * __restrict__ dydr,
250 const T * __restrict__ dzdr,
251 const T * __restrict__ dxds,
252 const T * __restrict__ dyds,
253 const T * __restrict__ dzds,
254 const T * __restrict__ dxdt,
255 const T * __restrict__ dydt,
256 const T * __restrict__ dzdt,
// Flat global thread index and total number of threads in the grid.
259 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
260 const int str = blockDim.x * gridDim.x;
// Grid-stride loop: each thread handles i = idx, idx + str, ... below n.
263 for (
int i = idx;
i < n;
i += str) {
// 3x3 determinant: three positive and three negative triple products.
264 jac[
i] = (dxdr[
i] * dyds[
i] * dzdt[
i])
265 + (dxdt[
i] * dydr[
i] * dzds[
i])
266 + (dxds[
i] * dydt[
i] * dzdr[
i])
267 - (dxdr[
i] * dydt[
i] * dzds[
i])
268 - (dxds[
i] * dydr[
i] * dzdt[
i])
269 - (dxdt[
i] * dyds[
i] * dzdr[
i]);
// NOTE(review): original lines 270-271 are missing from this view
// (presumably where jacinv is set -- confirm).
// r-row: built from the s- and t-derivative inputs only.
272 drdx[
i] = dyds[
i]*dzdt[
i] - dydt[
i]*dzds[
i];
273 drdy[
i] = dxdt[
i]*dzds[
i] - dxds[
i]*dzdt[
i];
274 drdz[
i] = dxds[
i]*dydt[
i] - dxdt[
i]*dyds[
i];
// s-row: built from the r- and t-derivative inputs only.
275 dsdx[
i] = dydt[
i]*dzdr[
i] - dydr[
i]*dzdt[
i];
276 dsdy[
i] = dxdr[
i]*dzdt[
i] - dxdt[
i]*dzdr[
i];
277 dsdz[
i] = dxdt[
i]*dydr[
i] - dxdr[
i]*dydt[
i];
// t-row: built from the r- and s-derivative inputs only.
278 dtdx[
i] = dydr[
i]*dzds[
i] - dyds[
i]*dzdr[
i];
279 dtdy[
i] = dxds[
i]*dzdr[
i] - dxdr[
i]*dzds[
i];
280 dtdz[
i] = dxdr[
i]*dyds[
i] - dxds[
i]*dydr[
i];
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__shared__ T shdy[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__shared__ T shdz[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void coef_generate_dxyz_kernel(T *__restrict__ dxdr, T *__restrict__ dydr, T *__restrict__ dzdr, T *__restrict__ dxds, T *__restrict__ dyds, T *__restrict__ dzds, T *__restrict__ dxdt, T *__restrict__ dydt, T *__restrict__ dzdt, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ x, const T *__restrict__ y, const T *__restrict__ z)
__global__ void coef_generate_drst_kernel(T *__restrict__ jac, T *__restrict__ jacinv, T *__restrict__ drdx, T *__restrict__ drdy, T *__restrict__ drdz, T *__restrict__ dsdx, T *__restrict__ dsdy, T *__restrict__ dsdz, T *__restrict__ dtdx, T *__restrict__ dtdy, T *__restrict__ dtdz, const T *__restrict__ dxdr, const T *__restrict__ dydr, const T *__restrict__ dzdr, const T *__restrict__ dxds, const T *__restrict__ dyds, const T *__restrict__ dzds, const T *__restrict__ dxdt, const T *__restrict__ dydt, const T *__restrict__ dzdt, const int n)
__global__ void coef_generate_geo_kernel(T *__restrict__ G11, T *__restrict__ G12, T *__restrict__ G13, T *__restrict__ G22, T *__restrict__ G23, T *__restrict__ G33, const T *__restrict__ drdx, const T *__restrict__ drdy, const T *__restrict__ drdz, const T *__restrict__ dsdx, const T *__restrict__ dsdy, const T *__restrict__ dsdz, const T *__restrict__ dtdx, const T *__restrict__ dtdy, const T *__restrict__ dtdz, const T *__restrict__ jacinv, const T *__restrict__ w3, const int gdim)