Neko  0.8.1
A portable framework for high-order spectral element flow simulations
ax_helm_kernel.h
Go to the documentation of this file.
1 #ifndef __MATH_AX_HELM_KERNEL_H__
2 #define __MATH_AX_HELM_KERNEL_H__
3 /*
4  Copyright (c) 2021-2023, The Neko Authors
5  All rights reserved.
6 
7  Redistribution and use in source and binary forms, with or without
8  modification, are permitted provided that the following conditions
9  are met:
10 
11  * Redistributions of source code must retain the above copyright
12  notice, this list of conditions and the following disclaimer.
13 
14  * Redistributions in binary form must reproduce the above
15  copyright notice, this list of conditions and the following
16  disclaimer in the documentation and/or other materials provided
17  with the distribution.
18 
19  * Neither the name of the authors nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  POSSIBILITY OF SUCH DAMAGE.
35 */
36 
/**
 * Helmholtz matrix-vector product for spectral elements, 1-D-indexed
 * variant: computes w = A u, one element per CUDA block.
 *
 * Launch layout (as used by the code below): blockIdx.x selects the
 * element e; a 1-D thread block sweeps the element's LX*LX*LX points in
 * strides of CHUNKS.  NOTE(review): the operator loads guarded by
 * `iii < LX*LX` assume blockDim.x >= LX*LX, and the chunked sweeps
 * assume blockDim.x == CHUNKS — confirm against the launch site.
 *
 * Template parameters:
 *   T      - scalar type
 *   LX     - points per direction (each element holds LX^3 values)
 *   CHUNKS - stride with which the thread block walks the element
 *
 * Arguments (device pointers; u, w, h1 and g* hold one LX^3 slab per
 * element, indexed by e*LX*LX*LX):
 *   w            - output field, overwritten with the result
 *   u            - input field
 *   dx, dy, dz   - LX*LX operator matrices applied in the first pass
 *   dxt, dyt, dzt- LX*LX operator matrices applied in the second pass
 *                  (presumably the transposes of dx/dy/dz, per naming —
 *                  confirm with the host-side setup)
 *   h1           - pointwise Helmholtz coefficient
 *   g11..g23     - pointwise geometric factors; used below as the six
 *                  unique entries of a symmetric 3x3 matrix G
 */
template< typename T, const int LX, const int CHUNKS >
__global__ void ax_helm_kernel_1d(T * __restrict__ w,
                                  const T * __restrict__ u,
                                  const T * __restrict__ dx,
                                  const T * __restrict__ dy,
                                  const T * __restrict__ dz,
                                  const T * __restrict__ dxt,
                                  const T * __restrict__ dyt,
                                  const T * __restrict__ dzt,
                                  const T * __restrict__ h1,
                                  const T * __restrict__ g11,
                                  const T * __restrict__ g22,
                                  const T * __restrict__ g33,
                                  const T * __restrict__ g12,
                                  const T * __restrict__ g13,
                                  const T * __restrict__ g23) {

  /* LX*LX operator matrices staged in shared memory. */
  __shared__ T shdx[LX*LX];
  __shared__ T shdy[LX*LX];
  __shared__ T shdzt[LX*LX];

  __shared__ T shdxt[LX*LX];
  __shared__ T shdyt[LX*LX];
  __shared__ T shdz[LX*LX];

  /* Element-local copy of u and the three scaled directional
     contractions produced by the first pass. */
  __shared__ T shu[LX*LX*LX];
  __shared__ T shur[LX*LX*LX];
  __shared__ T shus[LX*LX*LX];
  __shared__ T shut[LX*LX*LX];

  const int e = blockIdx.x;      /* one spectral element per block */
  const int iii = threadIdx.x;
  /* ceil(LX^3 / CHUNKS): passes needed to cover the whole element. */
  const int nchunks = (LX * LX * LX - 1)/CHUNKS + 1;

  /* Stage the first-pass operators (only the first LX*LX threads). */
  if (iii<LX*LX) {
    shdx[iii] = dx[iii];
    shdy[iii] = dy[iii];
    shdz[iii] = dz[iii];
  }

  /* Stage this element's slab of u, CHUNKS points at a time. */
  {
    int i = iii;
    while (i < LX * LX * LX){
      shu[i] = u[i+e*LX*LX*LX];
      i = i + CHUNKS;
    }
  }

  /* Barrier: shdx/shdy/shdz and shu must be complete before any thread
     reads them in the first compute pass below. */
  __syncthreads();

  /* Stage the second-pass operators; these are not read until after
     the next barrier, so no extra synchronization is needed here. */
  if (iii<LX*LX){
    shdxt[iii] = dxt[iii];
    shdyt[iii] = dyt[iii];
    shdzt[iii] = dzt[iii];
  }

  /* Pass 1: for each point (i,j,k), contract u along the i-, j- and
     k-lines and scale by h1 * G (G symmetric). */
  for (int n=0; n<nchunks; n++){
    const int ijk = iii+n*CHUNKS;
    /* Decompose the flat index: ijk = i + j*LX + k*LX*LX. */
    const int jk = ijk/LX;
    const int i = ijk-jk*LX;
    const int k = jk/LX;
    const int j = jk-k*LX;
    if (i<LX && j<LX && k<LX && ijk < LX*LX*LX){
      const T G00 = g11[ijk+e*LX*LX*LX];
      const T G11 = g22[ijk+e*LX*LX*LX];
      const T G22 = g33[ijk+e*LX*LX*LX];
      const T G01 = g12[ijk+e*LX*LX*LX];
      const T G02 = g13[ijk+e*LX*LX*LX];
      const T G12 = g23[ijk+e*LX*LX*LX];
      const T H1 = h1[ijk+e*LX*LX*LX];
      T rtmp = 0.0;
      T stmp = 0.0;
      T ttmp = 0.0;
#pragma unroll
      for (int l = 0; l<LX; l++){
        rtmp = rtmp + shdx[i+l*LX] * shu[l+j*LX+k*LX*LX];
        stmp = stmp + shdy[j+l*LX] * shu[i+l*LX+k*LX*LX];
        ttmp = ttmp + shdz[k+l*LX] * shu[i+j*LX+l*LX*LX];
      }
      /* Apply the symmetric metric scaled by the coefficient h1. */
      shur[ijk] = H1 * (G00 * rtmp + G01 * stmp + G02 * ttmp);
      shus[ijk] = H1 * (G01 * rtmp + G11 * stmp + G12 * ttmp);
      shut[ijk] = H1 * (G02 * rtmp + G12 * stmp + G22 * ttmp);
    }
  }

  /* Barrier: shur/shus/shut (and shdxt/shdyt/shdzt staged above) must
     be complete before pass 2 reads values written by other threads. */
  __syncthreads();

  /* Pass 2: contract the scaled derivatives back with the transposed
     operators and write the result to global memory. */
  for (int n=0; n<nchunks; n++){
    const int ijk = iii+n*CHUNKS;
    const int jk = ijk/LX;
    const int k = jk/LX;
    const int j = jk-k*LX;
    const int i = ijk-jk*LX;
    if (i<LX && j<LX && k<LX && ijk <LX*LX*LX){
      T wijke = 0.0;
#pragma unroll
      for (int l = 0; l<LX; l++){
        wijke = wijke
              + shdxt[i+l*LX] * shur[l+j*LX+k*LX*LX]
              + shdyt[j+l*LX] * shus[i+l*LX+k*LX*LX]
              + shdzt[k+l*LX] * shut[i+j*LX+l*LX*LX];
      }
      w[ijk+e*LX*LX*LX] = wijke;
    }
  }
}
147 
/**
 * Helmholtz matrix-vector product w = A u, register-blocked "k-step"
 * variant: one element per block, a 2-D (LX x LX) thread block over an
 * i-j slab, marching through the k direction.
 *
 * Each thread (i = threadIdx.x, j = threadIdx.y) keeps its full column
 * of u (ru) and of the partial result (rw) in registers; only the
 * current i-j slab lives in shared memory.  NOTE(review): the unguarded
 * loads `shdx[ij] = dx[ij]` require blockDim == (LX, LX) — confirm at
 * the launch site.
 *
 * Template parameters:
 *   T  - scalar type
 *   LX - points per direction (each element holds LX^3 values)
 *
 * Arguments (device pointers; u, w, h1 and g* hold one LX^3 slab per
 * element, indexed by e*LX*LX*LX):
 *   w          - output field, overwritten with the result
 *   u          - input field
 *   dx, dy, dz - LX*LX operator matrices
 *   h1         - pointwise Helmholtz coefficient
 *   g11..g23   - pointwise geometric factors; the six unique entries of
 *                a symmetric 3x3 matrix G
 */
template< typename T, const int LX >
__global__ void ax_helm_kernel_kstep(T * __restrict__ w,
                                     const T * __restrict__ u,
                                     const T * __restrict__ dx,
                                     const T * __restrict__ dy,
                                     const T * __restrict__ dz,
                                     const T * __restrict__ h1,
                                     const T * __restrict__ g11,
                                     const T * __restrict__ g22,
                                     const T * __restrict__ g33,
                                     const T * __restrict__ g12,
                                     const T * __restrict__ g13,
                                     const T * __restrict__ g23) {

  /* Operator matrices staged in shared memory. */
  __shared__ T shdx[LX * LX];
  __shared__ T shdy[LX * LX];
  __shared__ T shdz[LX * LX];

  /* Current k-slab of u and of the two in-plane scaled derivatives. */
  __shared__ T shu[LX * LX];
  __shared__ T shur[LX * LX];
  __shared__ T shus[LX * LX];

  T ru[LX];   /* this thread's full column of u, in registers */
  T rw[LX];   /* this thread's accumulated result column */
  T rut;      /* k-direction scaled derivative at the current slab */

  const int e = blockIdx.x;      /* one spectral element per block */
  const int j = threadIdx.y;
  const int i = threadIdx.x;
  const int ij = i + j*LX;       /* flat index within a slab */
  const int ele = e*LX*LX*LX;    /* base offset of this element's slab */

  shdx[ij] = dx[ij];
  shdy[ij] = dy[ij];
  shdz[ij] = dz[ij];

  /* Load the whole column of u and clear the result column. */
#pragma unroll
  for(int k = 0; k < LX; ++k){
    ru[k] = u[ij + k*LX*LX + ele];
    rw[k] = 0.0;
  }


  /* Barrier: operators must be fully staged before the k-march. */
  __syncthreads();
#pragma unroll
  for (int k = 0; k < LX; ++k){
    const int ijk = ij + k*LX*LX;
    const T G00 = g11[ijk+ele];
    const T G11 = g22[ijk+ele];
    const T G22 = g33[ijk+ele];
    const T G01 = g12[ijk+ele];
    const T G02 = g13[ijk+ele];
    const T G12 = g23[ijk+ele];
    const T H1 = h1[ijk+ele];
    T ttmp = 0.0;
    /* Publish this thread's slab value; the k-derivative needs only the
       private register column ru, so it is computed before the barrier. */
    shu[ij] = ru[k];
    for (int l = 0; l < LX; l++){
      ttmp += shdz[k+l*LX] * ru[l];
    }
    /* Barrier: (a) shu is complete for the in-plane contractions below,
       and (b) the previous iteration's reads of shur/shus are done
       before they are overwritten. */
    __syncthreads();

    T rtmp = 0.0;
    T stmp = 0.0;
#pragma unroll
    for (int l = 0; l < LX; l++){
      rtmp += shdx[i+l*LX] * shu[l+j*LX];
      stmp += shdy[j+l*LX] * shu[i+l*LX];
    }
    /* Scale the gradient by h1 * G (G symmetric); the k component stays
       in a register since only this thread consumes it. */
    shur[ij] = H1
             * (G00 * rtmp
                + G01 * stmp
                + G02 * ttmp);
    shus[ij] = H1
             * (G01 * rtmp
                + G11 * stmp
                + G12 * ttmp);
    rut = H1
        * (G02 * rtmp
           + G12 * stmp
           + G22 * ttmp);

    /* Barrier: shur/shus complete before cross-thread reads below, and
       the next iteration's shu write must not start early. */
    __syncthreads();

    T wijke = 0.0;
#pragma unroll
    for (int l = 0; l < LX; l++){
      wijke += shur[l+j*LX] * shdx[l+i*LX];
      /* Scatter the k-direction contribution into the whole column. */
      rw[l] += rut * shdz[k+l*LX];
      wijke += shus[i+l*LX] * shdy[l + j*LX];
    }
    rw[k] += wijke;
  }
  /* Write the accumulated result column back to global memory. */
#pragma unroll
  for (int k = 0; k < LX; ++k){
    w[ij + k*LX*LX + ele] = rw[k];
  }
}
245 
/**
 * Padded variant of ax_helm_kernel_kstep: identical algorithm (w = A u,
 * one element per block, 2-D thread block marching in k), but the
 * shared arrays use a leading dimension of LX+1 — presumably to avoid
 * shared-memory bank conflicts for strided column accesses (classic
 * +1 padding; confirm with profiling notes).
 *
 * shur keeps an unpadded LX*LX layout because it is only read along the
 * fastest dimension (see comment at its declaration).  NOTE(review):
 * the unguarded loads `shdx[ij_p] = dx[ij]` require blockDim == (LX, LX)
 * — confirm at the launch site.
 *
 * Template parameters:
 *   T  - scalar type
 *   LX - points per direction (each element holds LX^3 values)
 *
 * Arguments: identical to ax_helm_kernel_kstep (w output, u input,
 * dx/dy/dz operators, h1 coefficient, g11..g23 symmetric geometric
 * factors; one LX^3 slab per element).
 */
template< typename T, const int LX >
__global__ void ax_helm_kernel_kstep_padded(T * __restrict__ w,
                                            const T * __restrict__ u,
                                            const T * __restrict__ dx,
                                            const T * __restrict__ dy,
                                            const T * __restrict__ dz,
                                            const T * __restrict__ h1,
                                            const T * __restrict__ g11,
                                            const T * __restrict__ g22,
                                            const T * __restrict__ g33,
                                            const T * __restrict__ g12,
                                            const T * __restrict__ g13,
                                            const T * __restrict__ g23) {

  /* Operator matrices, padded leading dimension (LX+1). */
  __shared__ T shdx[LX * (LX+1)];
  __shared__ T shdy[LX * (LX+1)];
  __shared__ T shdz[LX * (LX+1)];

  /* Current k-slab of u and the in-plane scaled derivatives. */
  __shared__ T shu[LX * (LX+1)];
  __shared__ T shur[LX * LX]; // only accessed using fastest dimension
  __shared__ T shus[LX * (LX+1)];

  T ru[LX];   /* this thread's full column of u, in registers */
  T rw[LX];   /* this thread's accumulated result column */
  T rut;      /* k-direction scaled derivative at the current slab */

  const int e = blockIdx.x;        /* one spectral element per block */
  const int j = threadIdx.y;
  const int i = threadIdx.x;
  const int ij = i + j*LX;         /* flat index, unpadded layout */
  const int ij_p = i + j*(LX+1);   /* flat index, padded layout */
  const int ele = e*LX*LX*LX;      /* base offset of this element */

  shdx[ij_p] = dx[ij];
  shdy[ij_p] = dy[ij];
  shdz[ij_p] = dz[ij];

  /* Load the whole column of u and clear the result column. */
#pragma unroll
  for(int k = 0; k < LX; ++k){
    ru[k] = u[ij + k*LX*LX + ele];
    rw[k] = 0.0;
  }


  /* Barrier: operators must be fully staged before the k-march. */
  __syncthreads();
#pragma unroll
  for (int k = 0; k < LX; ++k){
    const int ijk = ij + k*LX*LX;
    const T G00 = g11[ijk+ele];
    const T G11 = g22[ijk+ele];
    const T G22 = g33[ijk+ele];
    const T G01 = g12[ijk+ele];
    const T G02 = g13[ijk+ele];
    const T G12 = g23[ijk+ele];
    const T H1 = h1[ijk+ele];
    T ttmp = 0.0;
    /* Publish this thread's slab value; the k-derivative needs only the
       private register column ru, so it is computed before the barrier. */
    shu[ij_p] = ru[k];
    for (int l = 0; l < LX; l++){
      ttmp += shdz[k+l*(LX+1)] * ru[l];
    }
    /* Barrier: shu complete for the in-plane contractions, and previous
       iteration's reads of shur/shus finished before overwrite. */
    __syncthreads();

    T rtmp = 0.0;
    T stmp = 0.0;
#pragma unroll
    for (int l = 0; l < LX; l++){
      rtmp += shdx[i+l*(LX+1)] * shu[l+j*(LX+1)];
      stmp += shdy[j+l*(LX+1)] * shu[i+l*(LX+1)];
    }
    /* Scale the gradient by h1 * G (G symmetric); the k component stays
       in a register since only this thread consumes it. */
    shur[ij] = H1
             * (G00 * rtmp
                + G01 * stmp
                + G02 * ttmp);
    shus[ij_p] = H1
              * (G01 * rtmp
                 + G11 * stmp
                 + G12 * ttmp);
    rut = H1
        * (G02 * rtmp
           + G12 * stmp
           + G22 * ttmp);

    /* Barrier: shur/shus complete before cross-thread reads below, and
       the next iteration's shu write must not start early. */
    __syncthreads();

    T wijke = 0.0;
#pragma unroll
    for (int l = 0; l < LX; l++){
      wijke += shur[l+j*LX] * shdx[l+i*(LX+1)];
      /* Scatter the k-direction contribution into the whole column. */
      rw[l] += rut * shdz[k+l*(LX+1)];
      wijke += shus[i+l*(LX+1)] * shdy[l + j*(LX+1)];
    }
    rw[k] += wijke;
  }
  /* Write the accumulated result column back to global memory. */
#pragma unroll
  for (int k = 0; k < LX; ++k){
    w[ij + k*LX*LX + ele] = rw[k];
  }
}
349 
350 #endif // __MATH_AX_HELM_KERNEL_H__
__global__ void ax_helm_kernel_1d(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ dxt, const T *__restrict__ dyt, const T *__restrict__ dzt, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_kstep(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__global__ void ax_helm_kernel_kstep_padded(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
__shared__ T shdyt[LX *LX]
Definition: cdtp_kernel.h:120
__shared__ T shdzt[LX *LX]
Definition: cdtp_kernel.h:121
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
Definition: cdtp_kernel.h:115
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
Definition: cdtp_kernel.h:114
shdxt[ij]
Definition: cdtp_kernel.h:136
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
Definition: cdtp_kernel.h:113
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__shared__ T shu[LX *LX]
const int ij_p
__shared__ T shdy[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
__shared__ T shdz[LX *LX]
T ru[LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
const int i
const int ij
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
shdx[ij]
T rw[LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
T rut
const int e
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11
__global__ void const T *__restrict__ const T *__restrict__ dx
const int ele
__global__ void const T *__restrict__ u
const int j
__shared__ T shus[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__syncthreads()
__shared__ T shur[LX *LX]