Neko  0.8.1
A portable framework for high-order spectral element flow simulations
ax_helm_kernel.h
Go to the documentation of this file.
1 #ifndef __MATH_AX_HELM_KERNEL_H__
2 #define __MATH_AX_HELM_KERNEL_H__
3 /*
4  Copyright (c) 2021-2022, The Neko Authors
5  All rights reserved.
6 
7  Redistribution and use in source and binary forms, with or without
8  modification, are permitted provided that the following conditions
9  are met:
10 
11  * Redistributions of source code must retain the above copyright
12  notice, this list of conditions and the following disclaimer.
13 
14  * Redistributions in binary form must reproduce the above
15  copyright notice, this list of conditions and the following
16  disclaimer in the documentation and/or other materials provided
17  with the distribution.
18 
19  * Neither the name of the authors nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  POSSIBILITY OF SUCH DAMAGE.
35 */
36 
/**
 * Helmholtz-like matrix-vector product w = A u with a 1-D thread layout:
 * one spectral element per thread block, CHUNKS threads striding over the
 * LX*LX*LX nodes of that element.
 *
 * Template parameters:
 *   T      - scalar type
 *   LX     - points per direction (the element has LX^3 nodes)
 *   CHUNKS - chunk stride; the code assumes blockDim.x == CHUNKS and
 *            CHUNKS >= LX*LX so the first LX*LX threads can stage the
 *            operator matrices (NOTE(review): confirm with the host launcher)
 *
 * Arguments (device pointers; per-element stride LX^3 where noted):
 *   w             - output field (written; LX^3 per element)
 *   u             - input field (read; LX^3 per element)
 *   dx, dy, dz    - LX*LX operator matrices for the three local directions
 *   dxt, dyt, dzt - companion matrices; the *t suffix suggests transposes of
 *                   dx/dy/dz -- TODO confirm against host code
 *   h1            - per-node coefficient (LX^3 per element)
 *   g11,g22,g33,
 *   g12,g13,g23   - per-node symmetric geometric factors (LX^3 per element)
 */
template< typename T, const int LX, const int CHUNKS >
__global__ void ax_helm_kernel_1d(T * __restrict__ w,
                                  const T * __restrict__ u,
                                  const T * __restrict__ dx,
                                  const T * __restrict__ dy,
                                  const T * __restrict__ dz,
                                  const T * __restrict__ dxt,
                                  const T * __restrict__ dyt,
                                  const T * __restrict__ dzt,
                                  const T * __restrict__ h1,
                                  const T * __restrict__ g11,
                                  const T * __restrict__ g22,
                                  const T * __restrict__ g33,
                                  const T * __restrict__ g12,
                                  const T * __restrict__ g13,
                                  const T * __restrict__ g23) {

  /* Block-shared copies of the LX*LX operator matrices. */
  __shared__ T shdx[LX*LX];
  __shared__ T shdy[LX*LX];
  __shared__ T shdzt[LX*LX];

  __shared__ T shdxt[LX*LX];
  __shared__ T shdyt[LX*LX];
  __shared__ T shdz[LX*LX];

  /* Element-local input and the three scaled directional derivatives. */
  __shared__ T shu[LX*LX*LX];
  __shared__ T shur[LX*LX*LX];
  __shared__ T shus[LX*LX*LX];
  __shared__ T shut[LX*LX*LX];

  const int e = blockIdx.x;    /* element handled by this block */
  const int iii = threadIdx.x;
  /* ceil(LX^3 / CHUNKS): number of strided passes over the element. */
  const int nchunks = (LX * LX * LX - 1)/CHUNKS + 1;

  /* Stage the forward operators (first LX*LX threads only). */
  if (iii<LX*LX) {
    shdx[iii] = dx[iii];
    shdy[iii] = dy[iii];
    shdz[iii] = dz[iii];
  }

  /* Stage this element's input field, CHUNKS nodes at a time. */
  {
    int i = iii;
    while (i < LX * LX * LX){
      shu[i] = u[i+e*LX*LX*LX];
      i = i + CHUNKS;
    }
  }

  /* shu and shdx/shdy/shdz must be visible block-wide before pass 1. */
  __syncthreads();

  /* Stage the *t operators; they are first read only after the second
     barrier below, so no additional barrier is needed right here. */
  if (iii<LX*LX){
    shdxt[iii] = dxt[iii];
    shdyt[iii] = dyt[iii];
    shdzt[iii] = dzt[iii];
  }

  /* Pass 1: contract u with each operator along one index (directional
     derivatives), then scale by H1 and the symmetric geometric factors. */
  for (int n=0; n<nchunks; n++){
    const int ijk = iii+n*CHUNKS;
    /* Decompose the flat node index ijk into (i,j,k). */
    const int jk = ijk/LX;
    const int i = ijk-jk*LX;
    const int k = jk/LX;
    const int j = jk-k*LX;
    /* ijk < LX^3 is the live guard; i,j,k < LX follow from it. */
    if (i<LX && j<LX && k<LX && ijk < LX*LX*LX){
      const T G00 = g11[ijk+e*LX*LX*LX];
      const T G11 = g22[ijk+e*LX*LX*LX];
      const T G22 = g33[ijk+e*LX*LX*LX];
      const T G01 = g12[ijk+e*LX*LX*LX];
      const T G02 = g13[ijk+e*LX*LX*LX];
      const T G12 = g23[ijk+e*LX*LX*LX];
      const T H1 = h1[ijk+e*LX*LX*LX];
      T rtmp = 0.0;
      T stmp = 0.0;
      T ttmp = 0.0;
#pragma unroll
      for (int l = 0; l<LX; l++){
        rtmp = rtmp + shdx[i+l*LX] * shu[l+j*LX+k*LX*LX];
        stmp = stmp + shdy[j+l*LX] * shu[i+l*LX+k*LX*LX];
        ttmp = ttmp + shdz[k+l*LX] * shu[i+j*LX+l*LX*LX];
      }
      /* Apply the symmetric 3x3 factor matrix [G00 G01 G02; ...] times H1. */
      shur[ijk] = H1 * (G00 * rtmp + G01 * stmp + G02 * ttmp);
      shus[ijk] = H1 * (G01 * rtmp + G11 * stmp + G12 * ttmp);
      shut[ijk] = H1 * (G02 * rtmp + G12 * stmp + G22 * ttmp);
    }
  }

  /* shur/shus/shut (and the *t operators) must be complete before pass 2. */
  __syncthreads();

  /* Pass 2: contract with the *t operators and write the result to w. */
  for (int n=0; n<nchunks; n++){
    const int ijk = iii+n*CHUNKS;
    const int jk = ijk/LX;
    const int k = jk/LX;
    const int j = jk-k*LX;
    const int i = ijk-jk*LX;
    if (i<LX && j<LX && k<LX && ijk <LX*LX*LX){
      T wijke = 0.0;
#pragma unroll
      for (int l = 0; l<LX; l++){
        wijke = wijke
              + shdxt[i+l*LX] * shur[l+j*LX+k*LX*LX]
              + shdyt[j+l*LX] * shus[i+l*LX+k*LX*LX]
              + shdzt[k+l*LX] * shut[i+j*LX+l*LX*LX];
      }
      w[ijk+e*LX*LX*LX] = wijke;
    }
  }
}
147 
/**
 * Helmholtz-like matrix-vector product w = A u with a 2-D (LX x LX) thread
 * block per element and a register sweep over the third (k) direction: each
 * thread holds its input column u(i,j,:) in registers (ru) and accumulates
 * its output column in rw, so only one LX*LX slice of u and of the scaled
 * derivatives lives in shared memory at a time.
 *
 * Launch layout (from the indexing below): gridDim.x = #elements,
 * blockDim = (LX, LX).  __launch_bounds__(LX*LX, 3) asks the compiler to
 * bound register use so at least three blocks can be resident per SM.
 *
 * Arguments as in ax_helm_kernel_1d, but without the *t matrices: the
 * transposed contractions are formed by swapping the index order on
 * shdx/shdy/shdz in the final accumulation loop.
 */
template< typename T, const int LX >
__global__ void __launch_bounds__(LX*LX,3)
  ax_helm_kernel_kstep(T * __restrict__ w,
                       const T * __restrict__ u,
                       const T * __restrict__ dx,
                       const T * __restrict__ dy,
                       const T * __restrict__ dz,
                       const T * __restrict__ h1,
                       const T * __restrict__ g11,
                       const T * __restrict__ g22,
                       const T * __restrict__ g33,
                       const T * __restrict__ g12,
                       const T * __restrict__ g13,
                       const T * __restrict__ g23) {

  /* Block-shared operator matrices. */
  __shared__ T shdx[LX * LX];
  __shared__ T shdy[LX * LX];
  __shared__ T shdz[LX * LX];

  __shared__ T shu[LX * LX];   /* current k-slice of u */
  __shared__ T shur[LX * LX];  /* scaled r-derivative of the slice */
  __shared__ T shus[LX * LX];  /* scaled s-derivative of the slice */

  T ru[LX];  /* this thread's input column u(i,j,:) */
  T rw[LX];  /* this thread's accumulated output column */
  T rut;     /* scaled t-derivative at the current node */

  const int e = blockIdx.x;    /* element handled by this block */
  const int j = threadIdx.y;
  const int i = threadIdx.x;
  const int ij = i + j*LX;     /* flat index within an LX*LX slice */
  const int ele = e*LX*LX*LX;  /* offset of this element's data */

  /* Stage the operator matrices (one entry per thread). */
  shdx[ij] = dx[ij];
  shdy[ij] = dy[ij];
  shdz[ij] = dz[ij];

  /* Load the input column and clear the output accumulator. */
#pragma unroll
  for(int k = 0; k < LX; ++k){
    ru[k] = u[ij + k*LX*LX + ele];
    rw[k] = 0.0;
  }

  /* BUGFIX: this barrier was missing.  shdz is read across threads in the
     ttmp loop at the top of the first k iteration, BEFORE the in-loop
     __syncthreads(), so the staging writes above must be made visible
     block-wide first (matches ax_helm_kernel_kstep_padded). */
  __syncthreads();

#pragma unroll
  for (int k = 0; k < LX; ++k){
    const int ijk = ij + k*LX*LX;
    /* Per-node symmetric geometric factors and coefficient. */
    const T G00 = g11[ijk+ele];
    const T G11 = g22[ijk+ele];
    const T G22 = g33[ijk+ele];
    const T G01 = g12[ijk+ele];
    const T G02 = g13[ijk+ele];
    const T G12 = g23[ijk+ele];
    const T H1 = h1[ijk+ele];
    T ttmp = 0.0;
    /* Publish this thread's slice value; t-derivative needs only registers. */
    shu[ij] = ru[k];
    for (int l = 0; l < LX; l++){
      ttmp += shdz[k+l*LX] * ru[l];
    }
    /* shu slice complete before any cross-thread read of it. */
    __syncthreads();

    T rtmp = 0.0;
    T stmp = 0.0;
#pragma unroll
    for (int l = 0; l < LX; l++){
      rtmp += shdx[i+l*LX] * shu[l+j*LX];
      stmp += shdy[j+l*LX] * shu[i+l*LX];
    }
    /* Apply H1 times the symmetric factor matrix to (rtmp, stmp, ttmp). */
    shur[ij] = H1
             * (G00 * rtmp
                + G01 * stmp
                + G02 * ttmp);
    shus[ij] = H1
             * (G01 * rtmp
                + G11 * stmp
                + G12 * ttmp);
    rut      = H1
             * (G02 * rtmp
                + G12 * stmp
                + G22 * ttmp);

    /* shur/shus slice complete before the transposed contraction. */
    __syncthreads();

    T wijke = 0.0;
    /* Transposed contractions: note the swapped index order on shdx/shdy,
       and the t-contribution scattered into the whole rw column. */
#pragma unroll
    for (int l = 0; l < LX; l++){
      wijke += shur[l+j*LX] * shdx[l+i*LX];
      rw[l] += rut * shdz[k+l*LX];
      wijke += shus[i+l*LX] * shdy[l + j*LX];
    }
    rw[k] += wijke;
  }

  /* Write the accumulated output column. */
#pragma unroll
  for (int k = 0; k < LX; ++k){
    w[ij + k*LX*LX + ele] = rw[k];
  }
}
246 
/**
 * Variant of ax_helm_kernel_kstep with the inner dimension of most shared
 * arrays padded to LX+1 to avoid shared-memory bank conflicts when LX is a
 * multiple of the bank count; shur stays unpadded because it is only read
 * along the fastest-varying index.  Algorithm and launch layout are
 * otherwise identical to ax_helm_kernel_kstep: one element per block,
 * blockDim = (LX, LX), register sweep over the k direction.
 */
template< typename T, const int LX >
__global__ void __launch_bounds__(LX*LX,3)
  ax_helm_kernel_kstep_padded(T * __restrict__ w,
                              const T * __restrict__ u,
                              const T * __restrict__ dx,
                              const T * __restrict__ dy,
                              const T * __restrict__ dz,
                              const T * __restrict__ h1,
                              const T * __restrict__ g11,
                              const T * __restrict__ g22,
                              const T * __restrict__ g33,
                              const T * __restrict__ g12,
                              const T * __restrict__ g13,
                              const T * __restrict__ g23) {

  /* Operator matrices, row stride padded to LX+1. */
  __shared__ T shdx[LX * (LX+1)];
  __shared__ T shdy[LX * (LX+1)];
  __shared__ T shdz[LX * (LX+1)];

  __shared__ T shu[LX * (LX+1)];  /* current k-slice of u (padded) */
  __shared__ T shur[LX * LX]; // only accessed using fastest dimension
  __shared__ T shus[LX * (LX+1)]; /* scaled s-derivative (padded) */

  T ru[LX];  /* this thread's input column u(i,j,:) */
  T rw[LX];  /* this thread's accumulated output column */
  T rut;     /* scaled t-derivative at the current node */

  const int e = blockIdx.x;        /* element handled by this block */
  const int j = threadIdx.y;
  const int i = threadIdx.x;
  const int ij = i + j*LX;         /* flat index in unpadded layout */
  const int ij_p = i + j*(LX+1);   /* flat index in padded layout */
  const int ele = e*LX*LX*LX;      /* offset of this element's data */

  /* Stage the operator matrices into the padded layout. */
  shdx[ij_p] = dx[ij];
  shdy[ij_p] = dy[ij];
  shdz[ij_p] = dz[ij];

  /* Load the input column and clear the output accumulator. */
#pragma unroll
  for(int k = 0; k < LX; ++k){
    ru[k] = u[ij + k*LX*LX + ele];
    rw[k] = 0.0;
  }

  /* shdz is read across threads in the first k iteration below before the
     in-loop barrier, so the staging writes must be visible block-wide. */
  __syncthreads();
#pragma unroll
  for (int k = 0; k < LX; ++k){
    const int ijk = ij + k*LX*LX;
    /* Per-node symmetric geometric factors and coefficient. */
    const T G00 = g11[ijk+ele];
    const T G11 = g22[ijk+ele];
    const T G22 = g33[ijk+ele];
    const T G01 = g12[ijk+ele];
    const T G02 = g13[ijk+ele];
    const T G12 = g23[ijk+ele];
    const T H1 = h1[ijk+ele];
    T ttmp = 0.0;
    /* Publish this thread's slice value; t-derivative needs only registers. */
    shu[ij_p] = ru[k];
    for (int l = 0; l < LX; l++){
      ttmp += shdz[k+l*(LX+1)] * ru[l];
    }
    /* shu slice complete before any cross-thread read of it. */
    __syncthreads();

    T rtmp = 0.0;
    T stmp = 0.0;
#pragma unroll
    for (int l = 0; l < LX; l++){
      rtmp += shdx[i+l*(LX+1)] * shu[l+j*(LX+1)];
      stmp += shdy[j+l*(LX+1)] * shu[i+l*(LX+1)];
    }
    /* Apply H1 times the symmetric factor matrix to (rtmp, stmp, ttmp). */
    shur[ij] = H1
             * (G00 * rtmp
                + G01 * stmp
                + G02 * ttmp);
    shus[ij_p] = H1
             * (G01 * rtmp
                + G11 * stmp
                + G12 * ttmp);
    rut      = H1
             * (G02 * rtmp
                + G12 * stmp
                + G22 * ttmp);

    /* shur/shus slice complete before the transposed contraction. */
    __syncthreads();

    T wijke = 0.0;
    /* Transposed contractions: swapped index order on shdx/shdy, and the
       t-contribution scattered into the whole rw column. */
#pragma unroll
    for (int l = 0; l < LX; l++){
      wijke += shur[l+j*LX] * shdx[l+i*(LX+1)];
      rw[l] += rut * shdz[k+l*(LX+1)];
      wijke += shus[i+l*(LX+1)] * shdy[l + j*(LX+1)];
    }
    rw[k] += wijke;
  }

  /* Write the accumulated output column. */
#pragma unroll
  for (int k = 0; k < LX; ++k){
    w[ij + k*LX*LX + ele] = rw[k];
  }
}
351 
352 #endif // __MATH_AX_HELM_KERNEL_H__
__global__ void ax_helm_kernel_1d(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ dxt, const T *__restrict__ dyt, const T *__restrict__ dzt, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_kstep(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_kstep_padded(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__shared__ T shdyt[LX *LX]
Definition: cdtp_kernel.h:120
__shared__ T shdzt[LX *LX]
Definition: cdtp_kernel.h:121
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
Definition: cdtp_kernel.h:115
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
Definition: cdtp_kernel.h:114
shdxt[ij]
Definition: cdtp_kernel.h:136
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
Definition: cdtp_kernel.h:113
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__shared__ T shu[LX *LX]
const int ij_p
__shared__ T shdy[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
__shared__ T shdz[LX *LX]
T ru[LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
const int i
const int ij
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
shdx[ij]
T rw[LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
T rut
const int e
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11
__global__ void const T *__restrict__ const T *__restrict__ dx
__global__ void __launch_bounds__(LX *LX, 3) ax_helm_kernel_kstep(T *__restrict__ w
const int ele
__global__ void const T *__restrict__ u
const int j
__shared__ T shus[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__syncthreads()
__shared__ T shur[LX *LX]