1 #ifndef __MATH_AX_HELM_KERNEL_H__
2 #define __MATH_AX_HELM_KERNEL_H__
41 template<
typename T, const
int LX, const
int CHUNKS >
43 const T * __restrict__
u,
44 const T * __restrict__
dx,
45 const T * __restrict__
dy,
46 const T * __restrict__
dz,
47 const T * __restrict__
dxt,
48 const T * __restrict__
dyt,
49 const T * __restrict__
dzt,
50 const T * __restrict__
h1,
51 const T * __restrict__
g11,
52 const T * __restrict__
g22,
53 const T * __restrict__
g33,
54 const T * __restrict__
g12,
55 const T * __restrict__
g13,
56 const T * __restrict__
g23) {
58 __shared__ T
shdx[LX*LX];
59 __shared__ T
shdy[LX*LX];
60 __shared__ T
shdzt[LX*LX];
62 __shared__ T
shdxt[LX*LX];
63 __shared__ T
shdyt[LX*LX];
64 __shared__ T
shdz[LX*LX];
66 __shared__ T
shu[LX*LX*LX];
67 __shared__ T
shur[LX*LX*LX];
68 __shared__ T
shus[LX*LX*LX];
69 __shared__ T shut[LX*LX*LX];
71 const int e = blockIdx.x;
72 const int iii = threadIdx.x;
73 const int nchunks = (LX * LX * LX - 1)/CHUNKS + 1;
83 while (
i < LX * LX * LX){
97 for (
int n=0; n<nchunks; n++){
98 const int ijk = iii+n*CHUNKS;
99 const int jk = ijk/LX;
100 const int i = ijk-jk*LX;
102 const int j = jk-k*LX;
103 if (
i<LX &&
j<LX && k<LX && ijk < LX*LX*LX){
104 const T G00 =
g11[ijk+
e*LX*LX*LX];
105 const T G11 =
g22[ijk+
e*LX*LX*LX];
106 const T G22 =
g33[ijk+
e*LX*LX*LX];
107 const T G01 =
g12[ijk+
e*LX*LX*LX];
108 const T G02 =
g13[ijk+
e*LX*LX*LX];
109 const T G12 =
g23[ijk+
e*LX*LX*LX];
110 const T H1 =
h1[ijk+
e*LX*LX*LX];
115 for (
int l = 0; l<LX; l++){
116 rtmp = rtmp +
shdx[
i+l*LX] *
shu[l+
j*LX+k*LX*LX];
117 stmp = stmp +
shdy[
j+l*LX] *
shu[
i+l*LX+k*LX*LX];
118 ttmp = ttmp +
shdz[k+l*LX] *
shu[
i+
j*LX+l*LX*LX];
120 shur[ijk] = H1 * (G00 * rtmp + G01 * stmp + G02 * ttmp);
121 shus[ijk] = H1 * (G01 * rtmp + G11 * stmp + G12 * ttmp);
122 shut[ijk] = H1 * (G02 * rtmp + G12 * stmp + G22 * ttmp);
128 for (
int n=0; n<nchunks; n++){
129 const int ijk = iii+n*CHUNKS;
130 const int jk = ijk/LX;
132 const int j = jk-k*LX;
133 const int i = ijk-jk*LX;
134 if (
i<LX &&
j<LX && k<LX && ijk <LX*LX*LX){
137 for (
int l = 0; l<LX; l++){
141 +
shdzt[k+l*LX] * shut[
i+
j*LX+l*LX*LX];
143 w[ijk+
e*LX*LX*LX] = wijke;
148 template<
typename T, const
int LX >
150 const T * __restrict__
u,
151 const T * __restrict__
dx,
152 const T * __restrict__
dy,
153 const T * __restrict__
dz,
154 const T * __restrict__
h1,
155 const T * __restrict__
g11,
156 const T * __restrict__
g22,
157 const T * __restrict__
g33,
158 const T * __restrict__
g12,
159 const T * __restrict__
g13,
160 const T * __restrict__
g23) {
162 __shared__ T
shdx[LX * LX];
163 __shared__ T
shdy[LX * LX];
164 __shared__ T
shdz[LX * LX];
166 __shared__ T
shu[LX * LX];
167 __shared__ T
shur[LX * LX];
168 __shared__ T
shus[LX * LX];
174 const int e = blockIdx.x;
175 const int j = threadIdx.y;
176 const int i = threadIdx.x;
177 const int ij =
i +
j*LX;
178 const int ele =
e*LX*LX*LX;
185 for(
int k = 0; k < LX; ++k){
193 for (
int k = 0; k < LX; ++k){
194 const int ijk =
ij + k*LX*LX;
195 const T G00 =
g11[ijk+
ele];
196 const T G11 =
g22[ijk+
ele];
197 const T G22 =
g33[ijk+
ele];
198 const T G01 =
g12[ijk+
ele];
199 const T G02 =
g13[ijk+
ele];
200 const T G12 =
g23[ijk+
ele];
201 const T H1 =
h1[ijk+
ele];
204 for (
int l = 0; l < LX; l++){
205 ttmp +=
shdz[k+l*LX] *
ru[l];
212 for (
int l = 0; l < LX; l++){
233 for (
int l = 0; l < LX; l++){
241 for (
int k = 0; k < LX; ++k){
251 template<
typename T, const
int LX >
253 const T * __restrict__
u,
254 const T * __restrict__
dx,
255 const T * __restrict__
dy,
256 const T * __restrict__
dz,
257 const T * __restrict__
h1,
258 const T * __restrict__
g11,
259 const T * __restrict__
g22,
260 const T * __restrict__
g33,
261 const T * __restrict__
g12,
262 const T * __restrict__
g13,
263 const T * __restrict__
g23) {
265 __shared__ T
shdx[LX * (LX+1)];
266 __shared__ T
shdy[LX * (LX+1)];
267 __shared__ T
shdz[LX * (LX+1)];
269 __shared__ T
shu[LX * (LX+1)];
270 __shared__ T
shur[LX * LX];
271 __shared__ T
shus[LX * (LX+1)];
277 const int e = blockIdx.x;
278 const int j = threadIdx.y;
279 const int i = threadIdx.x;
280 const int ij =
i +
j*LX;
281 const int ij_p =
i +
j*(LX+1);
282 const int ele =
e*LX*LX*LX;
289 for(
int k = 0; k < LX; ++k){
297 for (
int k = 0; k < LX; ++k){
298 const int ijk =
ij + k*LX*LX;
299 const T G00 =
g11[ijk+
ele];
300 const T G11 =
g22[ijk+
ele];
301 const T G22 =
g33[ijk+
ele];
302 const T G01 =
g12[ijk+
ele];
303 const T G02 =
g13[ijk+
ele];
304 const T G12 =
g23[ijk+
ele];
305 const T H1 =
h1[ijk+
ele];
308 for (
int l = 0; l < LX; l++){
309 ttmp +=
shdz[k+l*(LX+1)] *
ru[l];
316 for (
int l = 0; l < LX; l++){
317 rtmp +=
shdx[
i+l*(LX+1)] *
shu[l+
j*(LX+1)];
318 stmp +=
shdy[
j+l*(LX+1)] *
shu[
i+l*(LX+1)];
337 for (
int l = 0; l < LX; l++){
340 wijke +=
shus[
i+l*(LX+1)] *
shdy[l +
j*(LX+1)];
345 for (
int k = 0; k < LX; ++k){
354 template<
typename T, const
int LX >
358 const T * __restrict__
u,
359 const T * __restrict__
v,
360 const T * __restrict__
w,
361 const T * __restrict__
dx,
362 const T * __restrict__
dy,
363 const T * __restrict__
dz,
364 const T * __restrict__
h1,
365 const T * __restrict__
g11,
366 const T * __restrict__
g22,
367 const T * __restrict__
g33,
368 const T * __restrict__
g12,
369 const T * __restrict__
g13,
370 const T * __restrict__
g23) {
372 __shared__ T
shdx[LX * LX];
373 __shared__ T
shdy[LX * LX];
374 __shared__ T
shdz[LX * LX];
376 __shared__ T
shu[LX * LX];
377 __shared__ T
shur[LX * LX];
378 __shared__ T
shus[LX * LX];
380 __shared__ T
shv[LX * LX];
381 __shared__ T
shvr[LX * LX];
382 __shared__ T
shvs[LX * LX];
384 __shared__ T
shw[LX * LX];
385 __shared__ T
shwr[LX * LX];
386 __shared__ T
shws[LX * LX];
400 const int e = blockIdx.x;
401 const int j = threadIdx.y;
402 const int i = threadIdx.x;
403 const int ij =
i +
j*LX;
404 const int ele =
e*LX*LX*LX;
411 for(
int k = 0; k < LX; ++k){
425 for (
int k = 0; k < LX; ++k){
426 const int ijk =
ij + k*LX*LX;
427 const T G00 =
g11[ijk+
ele];
428 const T G11 =
g22[ijk+
ele];
429 const T G22 =
g33[ijk+
ele];
430 const T G01 =
g12[ijk+
ele];
431 const T G02 =
g13[ijk+
ele];
432 const T G12 =
g23[ijk+
ele];
433 const T H1 =
h1[ijk+
ele];
440 for (
int l = 0; l < LX; l++){
441 uttmp +=
shdz[k+l*LX] *
ru[l];
442 vttmp +=
shdz[k+l*LX] *
rv[l];
443 wttmp +=
shdz[k+l*LX] *
rw[l];
456 for (
int l = 0; l < LX; l++){
512 for (
int l = 0; l < LX; l++){
530 for (
int k = 0; k < LX; ++k){
537 template<
typename T, const
int LX >
541 const T * __restrict__
u,
542 const T * __restrict__
v,
543 const T * __restrict__
w,
544 const T * __restrict__
dx,
545 const T * __restrict__
dy,
546 const T * __restrict__
dz,
547 const T * __restrict__
h1,
548 const T * __restrict__
g11,
549 const T * __restrict__
g22,
550 const T * __restrict__
g33,
551 const T * __restrict__
g12,
552 const T * __restrict__
g13,
553 const T * __restrict__
g23) {
555 __shared__ T
shdx[LX * (LX+1)];
556 __shared__ T
shdy[LX * (LX+1)];
557 __shared__ T
shdz[LX * (LX+1)];
559 __shared__ T
shu[LX * (LX+1)];
560 __shared__ T
shur[LX * LX];
561 __shared__ T
shus[LX * (LX+1)];
563 __shared__ T
shv[LX * (LX+1)];
564 __shared__ T
shvr[LX * LX];
565 __shared__ T
shvs[LX * (LX+1)];
567 __shared__ T
shw[LX * (LX+1)];
568 __shared__ T
shwr[LX * LX];
569 __shared__ T
shws[LX * (LX+1)];
583 const int e = blockIdx.x;
584 const int j = threadIdx.y;
585 const int i = threadIdx.x;
586 const int ij =
i +
j*LX;
587 const int ij_p =
i +
j*(LX+1);
588 const int ele =
e*LX*LX*LX;
595 for(
int k = 0; k < LX; ++k){
609 for (
int k = 0; k < LX; ++k){
610 const int ijk =
ij + k*LX*LX;
611 const T G00 =
g11[ijk+
ele];
612 const T G11 =
g22[ijk+
ele];
613 const T G22 =
g33[ijk+
ele];
614 const T G01 =
g12[ijk+
ele];
615 const T G02 =
g13[ijk+
ele];
616 const T G12 =
g23[ijk+
ele];
617 const T H1 =
h1[ijk+
ele];
624 for (
int l = 0; l < LX; l++){
625 uttmp +=
shdz[k+l*(LX+1)] *
ru[l];
626 vttmp +=
shdz[k+l*(LX+1)] *
rv[l];
627 wttmp +=
shdz[k+l*(LX+1)] *
rw[l];
640 for (
int l = 0; l < LX; l++){
641 urtmp +=
shdx[
i+l*(LX+1)] *
shu[l+
j*(LX+1)];
642 ustmp +=
shdy[
j+l*(LX+1)] *
shu[
i+l*(LX+1)];
644 vrtmp +=
shdx[
i+l*(LX+1)] *
shv[l+
j*(LX+1)];
645 vstmp +=
shdy[
j+l*(LX+1)] *
shv[
i+l*(LX+1)];
647 wrtmp +=
shdx[
i+l*(LX+1)] *
shw[l+
j*(LX+1)];
648 wstmp +=
shdy[
j+l*(LX+1)] *
shw[
i+l*(LX+1)];
696 for (
int l = 0; l < LX; l++){
699 uwijke +=
shus[
i+l*(LX+1)] *
shdy[l +
j*(LX+1)];
703 vwijke +=
shvs[
i+l*(LX+1)] *
shdy[l +
j*(LX+1)];
707 wwijke +=
shws[
i+l*(LX+1)] *
shdy[l +
j*(LX+1)];
714 for (
int k = 0; k < LX; ++k){
721 template<
typename T >
725 const T * __restrict__
u,
726 const T * __restrict__
v,
727 const T * __restrict__
w,
728 const T * __restrict__ h2,
729 const T * __restrict__ B,
732 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
733 const int str = blockDim.x * gridDim.x;
735 for (
int i = idx;
i < n;
i += str) {
736 au[
i] = au[
i] + h2[
i] * B[
i] *
u[
i];
__global__ void ax_helm_kernel_vector_kstep(T *__restrict__ au, T *__restrict__ av, T *__restrict__ aw, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_1d(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ dxt, const T *__restrict__ dyt, const T *__restrict__ dzt, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_kstep(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_kstep_padded(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_vector_kstep_padded(T *__restrict__ au, T *__restrict__ av, T *__restrict__ aw, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_vector_part2(T *__restrict__ au, T *__restrict__ av, T *__restrict__ aw, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ h2, const T *__restrict__ B, const int n)
__shared__ T shdyt[LX *LX]
__shared__ T shdzt[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
__shared__ T shdy[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
__global__ void T *__restrict__ av
__shared__ T shus[LX *LX]
__shared__ T shdz[LX *LX]
__shared__ T shvs[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
__shared__ T shvr[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
__shared__ T shur[LX *LX]
__shared__ T shws[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
__global__ void T *__restrict__ T *__restrict__ aw
__global__ void const T *__restrict__ u
__global__ void const T *__restrict__ const T *__restrict__ dx
__shared__ T shwr[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v