1 #ifndef __MATH_AX_HELM_KERNEL_H__
2 #define __MATH_AX_HELM_KERNEL_H__
41 template<
typename T, const
int LX, const
int CHUNKS >
43 const T * __restrict__
u,
44 const T * __restrict__
dx,
45 const T * __restrict__
dy,
46 const T * __restrict__
dz,
47 const T * __restrict__
dxt,
48 const T * __restrict__
dyt,
49 const T * __restrict__
dzt,
50 const T * __restrict__
h1,
51 const T * __restrict__
g11,
52 const T * __restrict__
g22,
53 const T * __restrict__
g33,
54 const T * __restrict__
g12,
55 const T * __restrict__
g13,
56 const T * __restrict__
g23) {
58 __shared__ T
shdx[LX*LX];
59 __shared__ T
shdy[LX*LX];
60 __shared__ T
shdzt[LX*LX];
62 __shared__ T
shdxt[LX*LX];
63 __shared__ T
shdyt[LX*LX];
64 __shared__ T
shdz[LX*LX];
66 __shared__ T
shu[LX*LX*LX];
67 __shared__ T
shur[LX*LX*LX];
68 __shared__ T
shus[LX*LX*LX];
69 __shared__ T shut[LX*LX*LX];
71 const int e = blockIdx.x;
72 const int iii = threadIdx.x;
73 const int nchunks = (LX * LX * LX - 1)/CHUNKS + 1;
83 while (
i < LX * LX * LX){
97 for (
int n=0; n<nchunks; n++){
98 const int ijk = iii+n*CHUNKS;
99 const int jk = ijk/LX;
100 const int i = ijk-jk*LX;
102 const int j = jk-k*LX;
103 if (
i<LX &&
j<LX && k<LX && ijk < LX*LX*LX){
104 const T G00 =
g11[ijk+
e*LX*LX*LX];
105 const T G11 =
g22[ijk+
e*LX*LX*LX];
106 const T G22 =
g33[ijk+
e*LX*LX*LX];
107 const T G01 =
g12[ijk+
e*LX*LX*LX];
108 const T G02 =
g13[ijk+
e*LX*LX*LX];
109 const T G12 =
g23[ijk+
e*LX*LX*LX];
110 const T H1 =
h1[ijk+
e*LX*LX*LX];
115 for (
int l = 0; l<LX; l++){
116 rtmp = rtmp +
shdx[
i+l*LX] *
shu[l+
j*LX+k*LX*LX];
117 stmp = stmp +
shdy[
j+l*LX] *
shu[
i+l*LX+k*LX*LX];
118 ttmp = ttmp +
shdz[k+l*LX] *
shu[
i+
j*LX+l*LX*LX];
120 shur[ijk] = H1 * (G00 * rtmp + G01 * stmp + G02 * ttmp);
121 shus[ijk] = H1 * (G01 * rtmp + G11 * stmp + G12 * ttmp);
122 shut[ijk] = H1 * (G02 * rtmp + G12 * stmp + G22 * ttmp);
128 for (
int n=0; n<nchunks; n++){
129 const int ijk = iii+n*CHUNKS;
130 const int jk = ijk/LX;
132 const int j = jk-k*LX;
133 const int i = ijk-jk*LX;
134 if (
i<LX &&
j<LX && k<LX && ijk <LX*LX*LX){
137 for (
int l = 0; l<LX; l++){
141 +
shdzt[k+l*LX] * shut[
i+
j*LX+l*LX*LX];
143 w[ijk+
e*LX*LX*LX] = wijke;
148 template<
typename T, const
int LX >
151 const T * __restrict__
u,
152 const T * __restrict__
dx,
153 const T * __restrict__
dy,
154 const T * __restrict__
dz,
155 const T * __restrict__
h1,
156 const T * __restrict__
g11,
157 const T * __restrict__
g22,
158 const T * __restrict__
g33,
159 const T * __restrict__
g12,
160 const T * __restrict__
g13,
161 const T * __restrict__
g23) {
163 __shared__ T
shdx[LX * LX];
167 __shared__ T
shu[LX * LX];
175 const int e = blockIdx.x;
176 const int j = threadIdx.y;
177 const int i = threadIdx.x;
186 for(
int k = 0; k < LX; ++k){
194 for (
int k = 0; k < LX; ++k){
195 const int ijk =
ij + k*LX*LX;
196 const T G00 =
g11[ijk+
ele];
197 const T G11 =
g22[ijk+
ele];
198 const T G22 =
g33[ijk+
ele];
199 const T G01 =
g12[ijk+
ele];
200 const T G02 =
g13[ijk+
ele];
201 const T G12 =
g23[ijk+
ele];
202 const T H1 =
h1[ijk+
ele];
205 for (
int l = 0; l < LX; l++){
206 ttmp +=
shdz[k+l*LX] *
ru[l];
213 for (
int l = 0; l < LX; l++){
234 for (
int l = 0; l < LX; l++){
242 for (
int k = 0; k < LX; ++k){
252 template<
typename T, const
int LX >
255 const T * __restrict__
u,
256 const T * __restrict__
dx,
257 const T * __restrict__
dy,
258 const T * __restrict__
dz,
259 const T * __restrict__
h1,
260 const T * __restrict__
g11,
261 const T * __restrict__
g22,
262 const T * __restrict__
g33,
263 const T * __restrict__
g12,
264 const T * __restrict__
g13,
265 const T * __restrict__
g23) {
267 __shared__ T
shdx[LX * (LX+1)];
268 __shared__ T
shdy[LX * (LX+1)];
269 __shared__ T
shdz[LX * (LX+1)];
271 __shared__ T
shu[LX * (LX+1)];
272 __shared__ T
shur[LX * LX];
273 __shared__ T
shus[LX * (LX+1)];
279 const int e = blockIdx.x;
280 const int j = threadIdx.y;
281 const int i = threadIdx.x;
282 const int ij =
i +
j*LX;
284 const int ele =
e*LX*LX*LX;
291 for(
int k = 0; k < LX; ++k){
299 for (
int k = 0; k < LX; ++k){
300 const int ijk =
ij + k*LX*LX;
301 const T G00 =
g11[ijk+
ele];
302 const T G11 =
g22[ijk+
ele];
303 const T G22 =
g33[ijk+
ele];
304 const T G01 =
g12[ijk+
ele];
305 const T G02 =
g13[ijk+
ele];
306 const T G12 =
g23[ijk+
ele];
307 const T H1 =
h1[ijk+
ele];
310 for (
int l = 0; l < LX; l++){
311 ttmp +=
shdz[k+l*(LX+1)] *
ru[l];
318 for (
int l = 0; l < LX; l++){
319 rtmp +=
shdx[
i+l*(LX+1)] *
shu[l+
j*(LX+1)];
320 stmp +=
shdy[
j+l*(LX+1)] *
shu[
i+l*(LX+1)];
339 for (
int l = 0; l < LX; l++){
342 wijke +=
shus[
i+l*(LX+1)] *
shdy[l +
j*(LX+1)];
347 for (
int k = 0; k < LX; ++k){
356 template<
typename T, const
int LX >
361 const T * __restrict__
u,
362 const T * __restrict__
v,
363 const T * __restrict__
w,
364 const T * __restrict__
dx,
365 const T * __restrict__
dy,
366 const T * __restrict__
dz,
367 const T * __restrict__
h1,
368 const T * __restrict__
g11,
369 const T * __restrict__
g22,
370 const T * __restrict__
g33,
371 const T * __restrict__
g12,
372 const T * __restrict__
g13,
373 const T * __restrict__
g23) {
375 __shared__ T
shdx[LX * LX];
376 __shared__ T
shdy[LX * LX];
377 __shared__ T
shdz[LX * LX];
379 __shared__ T
shu[LX * LX];
380 __shared__ T
shur[LX * LX];
381 __shared__ T
shus[LX * LX];
383 __shared__ T
shv[LX * LX];
387 __shared__ T
shw[LX * LX];
403 const int e = blockIdx.x;
404 const int j = threadIdx.y;
405 const int i = threadIdx.x;
406 const int ij =
i +
j*LX;
407 const int ele =
e*LX*LX*LX;
414 for(
int k = 0; k < LX; ++k){
428 for (
int k = 0; k < LX; ++k){
429 const int ijk =
ij + k*LX*LX;
430 const T G00 =
g11[ijk+
ele];
431 const T G11 =
g22[ijk+
ele];
432 const T G22 =
g33[ijk+
ele];
433 const T G01 =
g12[ijk+
ele];
434 const T G02 =
g13[ijk+
ele];
435 const T G12 =
g23[ijk+
ele];
436 const T H1 =
h1[ijk+
ele];
443 for (
int l = 0; l < LX; l++){
444 uttmp +=
shdz[k+l*LX] *
ru[l];
445 vttmp +=
shdz[k+l*LX] *
rv[l];
446 wttmp +=
shdz[k+l*LX] *
rw[l];
459 for (
int l = 0; l < LX; l++){
515 for (
int l = 0; l < LX; l++){
533 for (
int k = 0; k < LX; ++k){
540 template<
typename T, const
int LX >
545 const T * __restrict__
u,
546 const T * __restrict__
v,
547 const T * __restrict__
w,
548 const T * __restrict__
dx,
549 const T * __restrict__
dy,
550 const T * __restrict__
dz,
551 const T * __restrict__
h1,
552 const T * __restrict__
g11,
553 const T * __restrict__
g22,
554 const T * __restrict__
g33,
555 const T * __restrict__
g12,
556 const T * __restrict__
g13,
557 const T * __restrict__
g23) {
559 __shared__ T
shdx[LX * (LX+1)];
560 __shared__ T
shdy[LX * (LX+1)];
561 __shared__ T
shdz[LX * (LX+1)];
563 __shared__ T
shu[LX * (LX+1)];
564 __shared__ T
shur[LX * LX];
565 __shared__ T
shus[LX * (LX+1)];
567 __shared__ T
shv[LX * (LX+1)];
568 __shared__ T
shvr[LX * LX];
569 __shared__ T
shvs[LX * (LX+1)];
571 __shared__ T
shw[LX * (LX+1)];
572 __shared__ T
shwr[LX * LX];
573 __shared__ T
shws[LX * (LX+1)];
587 const int e = blockIdx.x;
588 const int j = threadIdx.y;
589 const int i = threadIdx.x;
590 const int ij =
i +
j*LX;
591 const int ij_p =
i +
j*(LX+1);
592 const int ele =
e*LX*LX*LX;
599 for(
int k = 0; k < LX; ++k){
613 for (
int k = 0; k < LX; ++k){
614 const int ijk =
ij + k*LX*LX;
615 const T G00 =
g11[ijk+
ele];
616 const T G11 =
g22[ijk+
ele];
617 const T G22 =
g33[ijk+
ele];
618 const T G01 =
g12[ijk+
ele];
619 const T G02 =
g13[ijk+
ele];
620 const T G12 =
g23[ijk+
ele];
621 const T H1 =
h1[ijk+
ele];
628 for (
int l = 0; l < LX; l++){
629 uttmp +=
shdz[k+l*(LX+1)] *
ru[l];
630 vttmp +=
shdz[k+l*(LX+1)] *
rv[l];
631 wttmp +=
shdz[k+l*(LX+1)] *
rw[l];
644 for (
int l = 0; l < LX; l++){
645 urtmp +=
shdx[
i+l*(LX+1)] *
shu[l+
j*(LX+1)];
646 ustmp +=
shdy[
j+l*(LX+1)] *
shu[
i+l*(LX+1)];
648 vrtmp +=
shdx[
i+l*(LX+1)] *
shv[l+
j*(LX+1)];
649 vstmp +=
shdy[
j+l*(LX+1)] *
shv[
i+l*(LX+1)];
651 wrtmp +=
shdx[
i+l*(LX+1)] *
shw[l+
j*(LX+1)];
652 wstmp +=
shdy[
j+l*(LX+1)] *
shw[
i+l*(LX+1)];
700 for (
int l = 0; l < LX; l++){
703 uwijke +=
shus[
i+l*(LX+1)] *
shdy[l +
j*(LX+1)];
707 vwijke +=
shvs[
i+l*(LX+1)] *
shdy[l +
j*(LX+1)];
711 wwijke +=
shws[
i+l*(LX+1)] *
shdy[l +
j*(LX+1)];
718 for (
int k = 0; k < LX; ++k){
725 template<
typename T >
729 const T * __restrict__
u,
730 const T * __restrict__
v,
731 const T * __restrict__
w,
732 const T * __restrict__ h2,
733 const T * __restrict__ B,
736 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
737 const int str = blockDim.x * gridDim.x;
739 for (
int i = idx;
i < n;
i += str) {
740 au[
i] = au[
i] + h2[
i] * B[
i] *
u[
i];
__global__ void ax_helm_kernel_vector_kstep(T *__restrict__ au, T *__restrict__ av, T *__restrict__ aw, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_1d(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ dxt, const T *__restrict__ dyt, const T *__restrict__ dzt, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_kstep(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_kstep_padded(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_vector_kstep_padded(T *__restrict__ au, T *__restrict__ av, T *__restrict__ aw, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_vector_part2(T *__restrict__ au, T *__restrict__ av, T *__restrict__ aw, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ h2, const T *__restrict__ B, const int n)
__shared__ T shdyt[LX *LX]
__shared__ T shdzt[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
__shared__ T shdy[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
__global__ void T *__restrict__ av
__shared__ T shus[LX *LX]
__shared__ T shdz[LX *LX]
__shared__ T shvs[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
__shared__ T shvr[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
__shared__ T shur[LX *LX]
__shared__ T shws[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
__global__ void __launch_bounds__(LX *LX, 3) ax_helm_kernel_kstep(T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ aw
__global__ void const T *__restrict__ u
__global__ void const T *__restrict__ const T *__restrict__ dx
__shared__ T shwr[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v