Neko 0.9.99
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
ax_helm_full_kernel.h
Go to the documentation of this file.
1#ifndef __MATH_AX_HELM_FULL_KERNEL_H__
2#define __MATH_AX_HELM_FULL_KERNEL_H__
3/*
4 Copyright (c) 2024, The Neko Authors
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following
16 disclaimer in the documentation and/or other materials provided
17 with the distribution.
18
19 * Neither the name of the authors nor the names of its
20 contributors may be used to endorse or promote products derived
21 from this software without specific prior written permission.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35*/
36
41template< typename T, const int LX >
64
68
72
76
80
84
88
92
93 const int e = blockIdx.x;
94 const int j = threadIdx.y;
95 const int i = threadIdx.x;
96 const int ij = i + j*LX;
97 const int ele = e*LX*LX*LX;
98
99 shdx[ij] = dx[ij];
100 shdy[ij] = dy[ij];
101 shdz[ij] = dz[ij];
102
103#pragma unroll
104 for(int k = 0; k < LX; ++k){
105 ru[k] = u[ij + k*LX*LX + ele];
106 ruw[k] = 0.0;
107
108 rv[k] = v[ij + k*LX*LX + ele];
109 rvw[k] = 0.0;
110
111 rw[k] = w[ij + k*LX*LX + ele];
112 rww[k] = 0.0;
113 }
114
115
117#pragma unroll
118 for (int k = 0; k < LX; ++k){
119 const int ijk = ij + k*LX*LX;
120 const T drdx_local = drdx[ijk+ele];
121 const T drdy_local = drdy[ijk+ele];
122 const T drdz_local = drdz[ijk+ele];
123 const T dsdx_local = dsdx[ijk+ele];
124 const T dsdy_local = dsdy[ijk+ele];
125 const T dsdz_local = dsdz[ijk+ele];
126 const T dtdx_local = dtdx[ijk+ele];
127 const T dtdy_local = dtdy[ijk+ele];
128 const T dtdz_local = dtdz[ijk+ele];
129 const T dj = h1[ijk+ele] *
130 weight3[ijk] *
131 jacinv[ijk+ele];
132
133 T uttmp = 0.0;
134 T vttmp = 0.0;
135 T wttmp = 0.0;
136 shu[ij] = ru[k];
137 shv[ij] = rv[k];
138 shw[ij] = rw[k];
139 for (int l = 0; l < LX; l++){
140 uttmp += shdz[k+l*LX] * ru[l];
141 vttmp += shdz[k+l*LX] * rv[l];
142 wttmp += shdz[k+l*LX] * rw[l];
143 }
145
146 T urtmp = 0.0;
147 T ustmp = 0.0;
148
149 T vrtmp = 0.0;
150 T vstmp = 0.0;
151
152 T wrtmp = 0.0;
153 T wstmp = 0.0;
154#pragma unroll
155 for (int l = 0; l < LX; l++){
156 urtmp += shdx[i+l*LX] * shu[l+j*LX];
157 ustmp += shdy[j+l*LX] * shu[i+l*LX];
158
159 vrtmp += shdx[i+l*LX] * shv[l+j*LX];
160 vstmp += shdy[j+l*LX] * shv[i+l*LX];
161
162 wrtmp += shdx[i+l*LX] * shw[l+j*LX];
163 wstmp += shdy[j+l*LX] * shw[i+l*LX];
164 }
165
166 T u1 = 0.0;
167 T u2 = 0.0;
168 T u3 = 0.0;
169 T v1 = 0.0;
170 T v2 = 0.0;
171 T v3 = 0.0;
172 T w1 = 0.0;
173 T w2 = 0.0;
174 T w3 = 0.0;
175
176 u1 = urtmp * drdx_local +
177 ustmp * dsdx_local +
179 u2 = urtmp * drdy_local +
180 ustmp * dsdy_local +
182 u3 = urtmp * drdz_local +
183 ustmp * dsdz_local +
185
186 v1 = vrtmp * drdx_local +
187 vstmp * dsdx_local +
189 v2 = vrtmp * drdy_local +
190 vstmp * dsdy_local +
192 v3 = vrtmp * drdz_local +
193 vstmp * dsdz_local +
195
196 w1 = wrtmp * drdx_local +
197 wstmp * dsdx_local +
199 w2 = wrtmp * drdy_local +
200 wstmp * dsdy_local +
202 w3 = wrtmp * drdz_local +
203 wstmp * dsdz_local +
205
206 T s11 = 0.0;
207 T s12 = 0.0;
208 T s13 = 0.0;
209 T s21 = 0.0;
210 T s22 = 0.0;
211 T s23 = 0.0;
212 T s31 = 0.0;
213 T s32 = 0.0;
214 T s33 = 0.0;
215
216 s11 = dj*(u1 + u1);
217 s12 = dj*(u2 + v1);
218 s13 = dj*(u3 + w1);
219 s21 = dj*(v1 + u2);
220 s22 = dj*(v2 + v2);
221 s23 = dj*(v3 + w2);
222 s31 = dj*(w1 + u3);
223 s32 = dj*(w2 + v3);
224 s33 = dj*(w3 + w3);
225
226 shur[ij] = drdx_local * s11 +
227 drdy_local * s12 +
228 drdz_local * s13;
229 shus[ij] = dsdx_local * s11 +
230 dsdy_local * s12 +
231 dsdz_local * s13;
232 rut = dtdx_local * s11 +
233 dtdy_local * s12 +
234 dtdz_local * s13;
235
236 shvr[ij] = drdx_local * s21 +
237 drdy_local * s22 +
238 drdz_local * s23;
239 shvs[ij] = dsdx_local * s21 +
240 dsdy_local * s22 +
241 dsdz_local * s23;
242 rvt = dtdx_local * s21 +
243 dtdy_local * s22 +
244 dtdz_local * s23;
245
246 shwr[ij] = drdx_local * s31 +
247 drdy_local * s32 +
248 drdz_local * s33;
249 shws[ij] = dsdx_local * s31 +
250 dsdy_local * s32 +
251 dsdz_local * s33;
252 rwt = dtdx_local * s31 +
253 dtdy_local * s32 +
254 dtdz_local * s33;
255
257
258 T uwijke = 0.0;
259 T vwijke = 0.0;
260 T wwijke = 0.0;
261#pragma unroll
262 for (int l = 0; l < LX; l++){
263 uwijke += shur[l+j*LX] * shdx[l+i*LX];
264 ruw[l] += rut * shdz[k+l*LX];
265 uwijke += shus[i+l*LX] * shdy[l + j*LX];
266
267 vwijke += shvr[l+j*LX] * shdx[l+i*LX];
268 rvw[l] += rvt * shdz[k+l*LX];
269 vwijke += shvs[i+l*LX] * shdy[l + j*LX];
270
271 wwijke += shwr[l+j*LX] * shdx[l+i*LX];
272 rww[l] += rwt * shdz[k+l*LX];
273 wwijke += shws[i+l*LX] * shdy[l + j*LX];
274 }
275 ruw[k] += uwijke;
276 rvw[k] += vwijke;
277 rww[k] += wwijke;
278 }
279#pragma unroll
280 for (int k = 0; k < LX; ++k){
281 au[ij + k*LX*LX + ele] = ruw[k];
282 av[ij + k*LX*LX + ele] = rvw[k];
283 aw[ij + k*LX*LX + ele] = rww[k];
284 }
285}
286
287template< typename T, const int LX >
290 T * __restrict__ av,
291 T * __restrict__ aw,
310
311 __shared__ T shdx[LX * (LX+1)];
312 __shared__ T shdy[LX * (LX+1)];
313 __shared__ T shdz[LX * (LX+1)];
314
315 __shared__ T shu[LX * (LX+1)];
316 __shared__ T shur[LX * LX];
317 __shared__ T shus[LX * (LX+1)];
318
319 __shared__ T shv[LX * (LX+1)];
320 __shared__ T shvr[LX * LX];
321 __shared__ T shvs[LX * (LX+1)];
322
323 __shared__ T shw[LX * (LX+1)];
324 __shared__ T shwr[LX * LX];
325 __shared__ T shws[LX * (LX+1)];
326
327 T ru[LX];
328 T rv[LX];
329 T rw[LX];
330
331 T ruw[LX];
332 T rvw[LX];
333 T rww[LX];
334
335 T rut;
336 T rvt;
337 T rwt;
338
339 const int e = blockIdx.x;
340 const int j = threadIdx.y;
341 const int i = threadIdx.x;
342 const int ij = i + j*LX;
343 const int ij_p = i + j*(LX+1);
344 const int ele = e*LX*LX*LX;
345
346 shdx[ij_p] = dx[ij];
347 shdy[ij_p] = dy[ij];
348 shdz[ij_p] = dz[ij];
349
350#pragma unroll
351 for(int k = 0; k < LX; ++k){
352 ru[k] = u[ij + k*LX*LX + ele];
353 ruw[k] = 0.0;
354
355 rv[k] = v[ij + k*LX*LX + ele];
356 rvw[k] = 0.0;
357
358 rw[k] = w[ij + k*LX*LX + ele];
359 rww[k] = 0.0;
360 }
361
362
364#pragma unroll
365 for (int k = 0; k < LX; ++k){
366 const int ijk = ij + k*LX*LX;
367 const T drdx_local = drdx[ijk+ele];
368 const T drdy_local = drdy[ijk+ele];
369 const T drdz_local = drdz[ijk+ele];
370 const T dsdx_local = dsdx[ijk+ele];
371 const T dsdy_local = dsdy[ijk+ele];
372 const T dsdz_local = dsdz[ijk+ele];
373 const T dtdx_local = dtdx[ijk+ele];
374 const T dtdy_local = dtdy[ijk+ele];
375 const T dtdz_local = dtdz[ijk+ele];
376 const T dj = h1[ijk+ele] *
377 weight3[ijk] *
378 jacinv[ijk+ele];
379
380 T uttmp = 0.0;
381 T vttmp = 0.0;
382 T wttmp = 0.0;
383 shu[ij_p] = ru[k];
384 shv[ij_p] = rv[k];
385 shw[ij_p] = rw[k];
386 for (int l = 0; l < LX; l++){
387 uttmp += shdz[k+l*(LX+1)] * ru[l];
388 vttmp += shdz[k+l*(LX+1)] * rv[l];
389 wttmp += shdz[k+l*(LX+1)] * rw[l];
390 }
392
393 T urtmp = 0.0;
394 T ustmp = 0.0;
395
396 T vrtmp = 0.0;
397 T vstmp = 0.0;
398
399 T wrtmp = 0.0;
400 T wstmp = 0.0;
401#pragma unroll
402 for (int l = 0; l < LX; l++){
403 urtmp += shdx[i+l*(LX+1)] * shu[l+j*(LX+1)];
404 ustmp += shdy[j+l*(LX+1)] * shu[i+l*(LX+1)];
405
406 vrtmp += shdx[i+l*(LX+1)] * shv[l+j*(LX+1)];
407 vstmp += shdy[j+l*(LX+1)] * shv[i+l*(LX+1)];
408
409 wrtmp += shdx[i+l*(LX+1)] * shw[l+j*(LX+1)];
410 wstmp += shdy[j+l*(LX+1)] * shw[i+l*(LX+1)];
411 }
412
413 T u1 = 0.0;
414 T u2 = 0.0;
415 T u3 = 0.0;
416 T v1 = 0.0;
417 T v2 = 0.0;
418 T v3 = 0.0;
419 T w1 = 0.0;
420 T w2 = 0.0;
421 T w3 = 0.0;
422
423 u1 = urtmp * drdx_local +
424 ustmp * dsdx_local +
426 u2 = urtmp * drdy_local +
427 ustmp * dsdy_local +
429 u3 = urtmp * drdz_local +
430 ustmp * dsdz_local +
432
433 v1 = vrtmp * drdx_local +
434 vstmp * dsdx_local +
436 v2 = vrtmp * drdy_local +
437 vstmp * dsdy_local +
439 v3 = vrtmp * drdz_local +
440 vstmp * dsdz_local +
442
443 w1 = wrtmp * drdx_local +
444 wstmp * dsdx_local +
446 w2 = wrtmp * drdy_local +
447 wstmp * dsdy_local +
449 w3 = wrtmp * drdz_local +
450 wstmp * dsdz_local +
452
453 T s11 = 0.0;
454 T s12 = 0.0;
455 T s13 = 0.0;
456 T s21 = 0.0;
457 T s22 = 0.0;
458 T s23 = 0.0;
459 T s31 = 0.0;
460 T s32 = 0.0;
461 T s33 = 0.0;
462
463 s11 = dj*(u1 + u1);
464 s12 = dj*(u2 + v1);
465 s13 = dj*(u3 + w1);
466 s21 = dj*(v1 + u2);
467 s22 = dj*(v2 + v2);
468 s23 = dj*(v3 + w2);
469 s31 = dj*(w1 + u3);
470 s32 = dj*(w2 + v3);
471 s33 = dj*(w3 + w3);
472
473 shur[ij] = drdx_local * s11 +
474 drdy_local * s12 +
475 drdz_local * s13;
476 shus[ij_p] = dsdx_local * s11 +
477 dsdy_local * s12 +
478 dsdz_local * s13;
479 rut = dtdx_local * s11 +
480 dtdy_local * s12 +
481 dtdz_local * s13;
482
483 shvr[ij] = drdx_local * s21 +
484 drdy_local * s22 +
485 drdz_local * s23;
486 shvs[ij_p] = dsdx_local * s21 +
487 dsdy_local * s22 +
488 dsdz_local * s23;
489 rvt = dtdx_local * s21 +
490 dtdy_local * s22 +
491 dtdz_local * s23;
492
493 shwr[ij] = drdx_local * s31 +
494 drdy_local * s32 +
495 drdz_local * s33;
496 shws[ij_p] = dsdx_local * s31 +
497 dsdy_local * s32 +
498 dsdz_local * s33;
499 rwt = dtdx_local * s31 +
500 dtdy_local * s32 +
501 dtdz_local * s33;
502
504
505 T uwijke = 0.0;
506 T vwijke = 0.0;
507 T wwijke = 0.0;
508#pragma unroll
509 for (int l = 0; l < LX; l++){
510 uwijke += shur[l+j*LX] * shdx[l+i*(LX+1)];
511 ruw[l] += rut * shdz[k+l*(LX+1)];
512 uwijke += shus[i+l*(LX+1)] * shdy[l + j*(LX+1)];
513
514 vwijke += shvr[l+j*LX] * shdx[l+i*(LX+1)];
515 rvw[l] += rvt * shdz[k+l*(LX+1)];
516 vwijke += shvs[i+l*(LX+1)] * shdy[l + j*(LX+1)];
517
518 wwijke += shwr[l+j*LX] * shdx[l+i*(LX+1)];
519 rww[l] += rwt * shdz[k+l*(LX+1)];
520 wwijke += shws[i+l*(LX+1)] * shdy[l + j*(LX+1)];
521 }
522 ruw[k] += uwijke;
523 rvw[k] += vwijke;
524 rww[k] += wwijke;
525 }
526#pragma unroll
527 for (int k = 0; k < LX; ++k){
528 au[ij + k*LX*LX + ele] = ruw[k];
529 av[ij + k*LX*LX + ele] = rvw[k];
530 aw[ij + k*LX*LX + ele] = rww[k];
531 }
532}
533
534template< typename T >
536 T * __restrict__ av,
537 T * __restrict__ aw,
538 const T * __restrict__ u,
539 const T * __restrict__ v,
540 const T * __restrict__ w,
541 const T * __restrict__ h2,
542 const T * __restrict__ B,
543 const int n) {
544
545 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
546 const int str = blockDim.x * gridDim.x;
547
548 for (int i = idx; i < n; i += str) {
549 au[i] = au[i] + h2[i] * B[i] * u[i];
550 av[i] = av[i] + h2[i] * B[i] * v[i];
551 aw[i] = aw[i] + h2[i] * B[i] * w[i];
552 }
553
554}
555#endif // __MATH_AX_HELM_FULL_KERNEL_H__
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
T rv[LX]
__shared__ T shu[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ weight3
const int ij_p
__shared__ T shdy[LX *LX]
__global__ void __launch_bounds__(LX *LX, 3) ax_helm_stress_kernel_vector_kstep(T *__restrict__ au
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__shared__ T shw[LX *LX]
T rvw[LX]
__global__ void T *__restrict__ T *__restrict__ aw
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__shared__ T shdz[LX *LX]
T rww[LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__shared__ T shvs[LX *LX]
__shared__ T shws[LX *LX]
const int i
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
const int ij
shdx[ij]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
const int e
__global__ void T *__restrict__ av
T ruw[LX]
T ru[LX]
__global__ void ax_helm_stress_kernel_vector_part2(T *__restrict__ au, T *__restrict__ av, T *__restrict__ aw, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ h2, const T *__restrict__ B, const int n)
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__shared__ T shwr[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
const int ele
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
T rw[LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__shared__ T shus[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__syncthreads()
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
__shared__ T shv[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__shared__ T shvr[LX *LX]
__shared__ T shur[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)