1 #ifndef __MATH_AX_HELM_FULL_KERNEL_H__
2 #define __MATH_AX_HELM_FULL_KERNEL_H__
41 template<
typename T, const
int LX >
43 ax_helm_stress_kernel_vector_kstep(T * __restrict__ au,
46 const T * __restrict__
u,
47 const T * __restrict__
v,
48 const T * __restrict__
w,
49 const T * __restrict__
dx,
50 const T * __restrict__
dy,
51 const T * __restrict__
dz,
52 const T * __restrict__
h1,
53 const T * __restrict__
drdx,
54 const T * __restrict__
drdy,
55 const T * __restrict__
drdz,
56 const T * __restrict__
dsdx,
57 const T * __restrict__
dsdy,
58 const T * __restrict__
dsdz,
59 const T * __restrict__
dtdx,
60 const T * __restrict__
dtdy,
61 const T * __restrict__
dtdz,
65 __shared__ T
shdx[LX * LX];
66 __shared__ T
shdy[LX * LX];
67 __shared__ T
shdz[LX * LX];
69 __shared__ T
shu[LX * LX];
70 __shared__ T
shur[LX * LX];
71 __shared__ T
shus[LX * LX];
73 __shared__ T
shv[LX * LX];
74 __shared__ T
shvr[LX * LX];
75 __shared__ T
shvs[LX * LX];
77 __shared__ T
shw[LX * LX];
78 __shared__ T
shwr[LX * LX];
79 __shared__ T
shws[LX * LX];
93 const int e = blockIdx.x;
94 const int j = threadIdx.y;
95 const int i = threadIdx.x;
97 const int ele =
e*LX*LX*LX;
104 for(
int k = 0; k < LX; ++k){
118 for (
int k = 0; k < LX; ++k){
119 const int ijk =
ij + k*LX*LX;
120 const T drdx_local =
drdx[ijk+
ele];
121 const T drdy_local =
drdy[ijk+
ele];
122 const T drdz_local =
drdz[ijk+
ele];
123 const T dsdx_local =
dsdx[ijk+
ele];
124 const T dsdy_local =
dsdy[ijk+
ele];
125 const T dsdz_local =
dsdz[ijk+
ele];
126 const T dtdx_local =
dtdx[ijk+
ele];
127 const T dtdy_local =
dtdy[ijk+
ele];
128 const T dtdz_local =
dtdz[ijk+
ele];
129 const T dj =
h1[ijk+
ele] *
139 for (
int l = 0; l < LX; l++){
140 uttmp +=
shdz[k+l*LX] *
ru[l];
141 vttmp +=
shdz[k+l*LX] *
rv[l];
142 wttmp +=
shdz[k+l*LX] *
rw[l];
155 for (
int l = 0; l < LX; l++){
176 u1 = urtmp * drdx_local +
179 u2 = urtmp * drdy_local +
182 u3 = urtmp * drdz_local +
186 v1 = vrtmp * drdx_local +
189 v2 = vrtmp * drdy_local +
192 v3 = vrtmp * drdz_local +
196 w1 = wrtmp * drdx_local +
199 w2 = wrtmp * drdy_local +
202 w3 = wrtmp * drdz_local +
226 shur[
ij] = drdx_local * s11 +
229 shus[
ij] = dsdx_local * s11 +
232 rut = dtdx_local * s11 +
236 shvr[
ij] = drdx_local * s21 +
239 shvs[
ij] = dsdx_local * s21 +
242 rvt = dtdx_local * s21 +
246 shwr[
ij] = drdx_local * s31 +
249 shws[
ij] = dsdx_local * s31 +
252 rwt = dtdx_local * s31 +
262 for (
int l = 0; l < LX; l++){
280 for (
int k = 0; k < LX; ++k){
287 template<
typename T, const
int LX >
289 ax_helm_stress_kernel_vector_kstep_padded(T * __restrict__ au,
292 const T * __restrict__
u,
293 const T * __restrict__
v,
294 const T * __restrict__
w,
295 const T * __restrict__
dx,
296 const T * __restrict__
dy,
297 const T * __restrict__
dz,
298 const T * __restrict__
h1,
299 const T * __restrict__
drdx,
300 const T * __restrict__
drdy,
301 const T * __restrict__
drdz,
302 const T * __restrict__
dsdx,
303 const T * __restrict__
dsdy,
304 const T * __restrict__
dsdz,
305 const T * __restrict__
dtdx,
306 const T * __restrict__
dtdy,
307 const T * __restrict__
dtdz,
308 const T * __restrict__
jacinv,
309 const T * __restrict__
weight3) {
311 __shared__ T
shdx[LX * (LX+1)];
312 __shared__ T
shdy[LX * (LX+1)];
313 __shared__ T
shdz[LX * (LX+1)];
315 __shared__ T
shu[LX * (LX+1)];
316 __shared__ T
shur[LX * LX];
317 __shared__ T
shus[LX * (LX+1)];
319 __shared__ T
shv[LX * (LX+1)];
320 __shared__ T
shvr[LX * LX];
321 __shared__ T
shvs[LX * (LX+1)];
323 __shared__ T
shw[LX * (LX+1)];
324 __shared__ T
shwr[LX * LX];
325 __shared__ T
shws[LX * (LX+1)];
339 const int e = blockIdx.x;
340 const int j = threadIdx.y;
341 const int i = threadIdx.x;
342 const int ij =
i +
j*LX;
344 const int ele =
e*LX*LX*LX;
351 for(
int k = 0; k < LX; ++k){
365 for (
int k = 0; k < LX; ++k){
366 const int ijk =
ij + k*LX*LX;
367 const T drdx_local =
drdx[ijk+
ele];
368 const T drdy_local =
drdy[ijk+
ele];
369 const T drdz_local =
drdz[ijk+
ele];
370 const T dsdx_local =
dsdx[ijk+
ele];
371 const T dsdy_local =
dsdy[ijk+
ele];
372 const T dsdz_local =
dsdz[ijk+
ele];
373 const T dtdx_local =
dtdx[ijk+
ele];
374 const T dtdy_local =
dtdy[ijk+
ele];
375 const T dtdz_local =
dtdz[ijk+
ele];
376 const T dj =
h1[ijk+
ele] *
386 for (
int l = 0; l < LX; l++){
387 uttmp +=
shdz[k+l*(LX+1)] *
ru[l];
388 vttmp +=
shdz[k+l*(LX+1)] *
rv[l];
389 wttmp +=
shdz[k+l*(LX+1)] *
rw[l];
402 for (
int l = 0; l < LX; l++){
403 urtmp +=
shdx[
i+l*(LX+1)] *
shu[l+
j*(LX+1)];
404 ustmp +=
shdy[
j+l*(LX+1)] *
shu[
i+l*(LX+1)];
406 vrtmp +=
shdx[
i+l*(LX+1)] *
shv[l+
j*(LX+1)];
407 vstmp +=
shdy[
j+l*(LX+1)] *
shv[
i+l*(LX+1)];
409 wrtmp +=
shdx[
i+l*(LX+1)] *
shw[l+
j*(LX+1)];
410 wstmp +=
shdy[
j+l*(LX+1)] *
shw[
i+l*(LX+1)];
423 u1 = urtmp * drdx_local +
426 u2 = urtmp * drdy_local +
429 u3 = urtmp * drdz_local +
433 v1 = vrtmp * drdx_local +
436 v2 = vrtmp * drdy_local +
439 v3 = vrtmp * drdz_local +
443 w1 = wrtmp * drdx_local +
446 w2 = wrtmp * drdy_local +
449 w3 = wrtmp * drdz_local +
473 shur[
ij] = drdx_local * s11 +
479 rut = dtdx_local * s11 +
483 shvr[
ij] = drdx_local * s21 +
489 rvt = dtdx_local * s21 +
493 shwr[
ij] = drdx_local * s31 +
499 rwt = dtdx_local * s31 +
509 for (
int l = 0; l < LX; l++){
512 uwijke +=
shus[
i+l*(LX+1)] *
shdy[l +
j*(LX+1)];
516 vwijke +=
shvs[
i+l*(LX+1)] *
shdy[l +
j*(LX+1)];
520 wwijke +=
shws[
i+l*(LX+1)] *
shdy[l +
j*(LX+1)];
527 for (
int k = 0; k < LX; ++k){
534 template<
typename T >
538 const T * __restrict__
u,
539 const T * __restrict__
v,
540 const T * __restrict__
w,
541 const T * __restrict__ h2,
542 const T * __restrict__ B,
545 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
546 const int str = blockDim.x * gridDim.x;
548 for (
int i = idx;
i < n;
i += str) {
549 au[
i] = au[
i] + h2[
i] * B[
i] *
u[
i];
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ weight3
__shared__ T shdy[LX *LX]
__global__ void __launch_bounds__(LX *LX, 3) ax_helm_stress_kernel_vector_kstep(T *__restrict__ au
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void T *__restrict__ T *__restrict__ aw
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__shared__ T shdz[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__shared__ T shvs[LX *LX]
__shared__ T shws[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ av
__global__ void ax_helm_stress_kernel_vector_part2(T *__restrict__ au, T *__restrict__ av, T *__restrict__ aw, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ h2, const T *__restrict__ B, const int n)
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__shared__ T shwr[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__shared__ T shus[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__shared__ T shvr[LX *LX]
__shared__ T shur[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3