Neko 1.99.2
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
ax_helm_full_kernel.h
Go to the documentation of this file.
1#ifndef __MATH_AX_HELM_FULL_KERNEL_H__
2#define __MATH_AX_HELM_FULL_KERNEL_H__
3/*
4 Copyright (c) 2024, The Neko Authors
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following
16 disclaimer in the documentation and/or other materials provided
17 with the distribution.
18
19 * Neither the name of the authors nor the names of its
20 contributors may be used to endorse or promote products derived
21 from this software without specific prior written permission.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35*/
36
41template< typename T, const int LX >
64
68
72
76
80
84
88
92
93 const int e = blockIdx.x;
94 const int j = threadIdx.y;
95 const int i = threadIdx.x;
96 const int ij = i + j*LX;
97 const int ele = e*LX*LX*LX;
98
99 shdx[ij] = dx[ij];
100 shdy[ij] = dy[ij];
101 shdz[ij] = dz[ij];
102
103#pragma unroll
104 for(int k = 0; k < LX; ++k){
105 ru[k] = u[ij + k*LX*LX + ele];
106 ruw[k] = 0.0;
107
108 rv[k] = v[ij + k*LX*LX + ele];
109 rvw[k] = 0.0;
110
111 rw[k] = w[ij + k*LX*LX + ele];
112 rww[k] = 0.0;
113 }
114
115
117#pragma unroll
118 for (int k = 0; k < LX; ++k){
119 const int ijk = ij + k*LX*LX;
120 const T drdx_local = drdx[ijk+ele];
121 const T drdy_local = drdy[ijk+ele];
122 const T drdz_local = drdz[ijk+ele];
123 const T dsdx_local = dsdx[ijk+ele];
124 const T dsdy_local = dsdy[ijk+ele];
125 const T dsdz_local = dsdz[ijk+ele];
126 const T dtdx_local = dtdx[ijk+ele];
127 const T dtdy_local = dtdy[ijk+ele];
128 const T dtdz_local = dtdz[ijk+ele];
129 const T dj = h1[ijk+ele] *
130 weight3[ijk] *
131 jacinv[ijk+ele];
132
133 T uttmp = 0.0;
134 T vttmp = 0.0;
135 T wttmp = 0.0;
136 shu[ij] = ru[k];
137 shv[ij] = rv[k];
138 shw[ij] = rw[k];
139 for (int l = 0; l < LX; l++){
140 uttmp += shdz[k+l*LX] * ru[l];
141 vttmp += shdz[k+l*LX] * rv[l];
142 wttmp += shdz[k+l*LX] * rw[l];
143 }
145
146 T urtmp = 0.0;
147 T ustmp = 0.0;
148
149 T vrtmp = 0.0;
150 T vstmp = 0.0;
151
152 T wrtmp = 0.0;
153 T wstmp = 0.0;
154#pragma unroll
155 for (int l = 0; l < LX; l++){
156 urtmp += shdx[i+l*LX] * shu[l+j*LX];
157 ustmp += shdy[j+l*LX] * shu[i+l*LX];
158
159 vrtmp += shdx[i+l*LX] * shv[l+j*LX];
160 vstmp += shdy[j+l*LX] * shv[i+l*LX];
161
162 wrtmp += shdx[i+l*LX] * shw[l+j*LX];
163 wstmp += shdy[j+l*LX] * shw[i+l*LX];
164 }
165
166 T u1 = 0.0;
167 T u2 = 0.0;
168 T u3 = 0.0;
169 T v1 = 0.0;
170 T v2 = 0.0;
171 T v3 = 0.0;
172 T w1 = 0.0;
173 T w2 = 0.0;
174 T w3 = 0.0;
175
176 u1 = urtmp * drdx_local +
177 ustmp * dsdx_local +
179 u2 = urtmp * drdy_local +
180 ustmp * dsdy_local +
182 u3 = urtmp * drdz_local +
183 ustmp * dsdz_local +
185
186 v1 = vrtmp * drdx_local +
187 vstmp * dsdx_local +
189 v2 = vrtmp * drdy_local +
190 vstmp * dsdy_local +
192 v3 = vrtmp * drdz_local +
193 vstmp * dsdz_local +
195
196 w1 = wrtmp * drdx_local +
197 wstmp * dsdx_local +
199 w2 = wrtmp * drdy_local +
200 wstmp * dsdy_local +
202 w3 = wrtmp * drdz_local +
203 wstmp * dsdz_local +
205
206 T s11 = 0.0;
207 T s22 = 0.0;
208 T s33 = 0.0;
209 T s12 = 0.0;
210 T s13 = 0.0;
211 T s23 = 0.0;
212
213 s11 = dj*(u1 + u1);
214 s22 = dj*(v2 + v2);
215 s33 = dj*(w3 + w3);
216 s12 = dj*(u2 + v1);
217 s13 = dj*(u3 + w1);
218 s23 = dj*(v3 + w2);
219
220 shur[ij] = drdx_local * s11 +
221 drdy_local * s12 +
222 drdz_local * s13;
223 shus[ij] = dsdx_local * s11 +
224 dsdy_local * s12 +
225 dsdz_local * s13;
226 rut = dtdx_local * s11 +
227 dtdy_local * s12 +
228 dtdz_local * s13;
229
230 shvr[ij] = drdx_local * s12 +
231 drdy_local * s22 +
232 drdz_local * s23;
233 shvs[ij] = dsdx_local * s12 +
234 dsdy_local * s22 +
235 dsdz_local * s23;
236 rvt = dtdx_local * s12 +
237 dtdy_local * s22 +
238 dtdz_local * s23;
239
240 shwr[ij] = drdx_local * s13 +
241 drdy_local * s23 +
242 drdz_local * s33;
243 shws[ij] = dsdx_local * s13 +
244 dsdy_local * s23 +
245 dsdz_local * s33;
246 rwt = dtdx_local * s13 +
247 dtdy_local * s23 +
248 dtdz_local * s33;
249
251
252 T uwijke = 0.0;
253 T vwijke = 0.0;
254 T wwijke = 0.0;
255#pragma unroll
256 for (int l = 0; l < LX; l++){
257 uwijke += shur[l+j*LX] * shdx[l+i*LX];
258 ruw[l] += rut * shdz[k+l*LX];
259 uwijke += shus[i+l*LX] * shdy[l + j*LX];
260
261 vwijke += shvr[l+j*LX] * shdx[l+i*LX];
262 rvw[l] += rvt * shdz[k+l*LX];
263 vwijke += shvs[i+l*LX] * shdy[l + j*LX];
264
265 wwijke += shwr[l+j*LX] * shdx[l+i*LX];
266 rww[l] += rwt * shdz[k+l*LX];
267 wwijke += shws[i+l*LX] * shdy[l + j*LX];
268 }
269 ruw[k] += uwijke;
270 rvw[k] += vwijke;
271 rww[k] += wwijke;
272 }
273#pragma unroll
274 for (int k = 0; k < LX; ++k){
275 au[ij + k*LX*LX + ele] = ruw[k];
276 av[ij + k*LX*LX + ele] = rvw[k];
277 aw[ij + k*LX*LX + ele] = rww[k];
278 }
279}
280
281template< typename T, const int LX >
284 T * __restrict__ av,
285 T * __restrict__ aw,
304
305 __shared__ T shdx[LX * (LX+1)];
306 __shared__ T shdy[LX * (LX+1)];
307 __shared__ T shdz[LX * (LX+1)];
308
309 __shared__ T shu[LX * (LX+1)];
310 __shared__ T shur[LX * LX];
311 __shared__ T shus[LX * (LX+1)];
312
313 __shared__ T shv[LX * (LX+1)];
314 __shared__ T shvr[LX * LX];
315 __shared__ T shvs[LX * (LX+1)];
316
317 __shared__ T shw[LX * (LX+1)];
318 __shared__ T shwr[LX * LX];
319 __shared__ T shws[LX * (LX+1)];
320
321 T ru[LX];
322 T rv[LX];
323 T rw[LX];
324
325 T ruw[LX];
326 T rvw[LX];
327 T rww[LX];
328
329 T rut;
330 T rvt;
331 T rwt;
332
333 const int e = blockIdx.x;
334 const int j = threadIdx.y;
335 const int i = threadIdx.x;
336 const int ij = i + j*LX;
337 const int ij_p = i + j*(LX+1);
338 const int ele = e*LX*LX*LX;
339
340 shdx[ij_p] = dx[ij];
341 shdy[ij_p] = dy[ij];
342 shdz[ij_p] = dz[ij];
343
344#pragma unroll
345 for(int k = 0; k < LX; ++k){
346 ru[k] = u[ij + k*LX*LX + ele];
347 ruw[k] = 0.0;
348
349 rv[k] = v[ij + k*LX*LX + ele];
350 rvw[k] = 0.0;
351
352 rw[k] = w[ij + k*LX*LX + ele];
353 rww[k] = 0.0;
354 }
355
356
358#pragma unroll
359 for (int k = 0; k < LX; ++k){
360 const int ijk = ij + k*LX*LX;
361 const T drdx_local = drdx[ijk+ele];
362 const T drdy_local = drdy[ijk+ele];
363 const T drdz_local = drdz[ijk+ele];
364 const T dsdx_local = dsdx[ijk+ele];
365 const T dsdy_local = dsdy[ijk+ele];
366 const T dsdz_local = dsdz[ijk+ele];
367 const T dtdx_local = dtdx[ijk+ele];
368 const T dtdy_local = dtdy[ijk+ele];
369 const T dtdz_local = dtdz[ijk+ele];
370 const T dj = h1[ijk+ele] *
371 weight3[ijk] *
372 jacinv[ijk+ele];
373
374 T uttmp = 0.0;
375 T vttmp = 0.0;
376 T wttmp = 0.0;
377 shu[ij_p] = ru[k];
378 shv[ij_p] = rv[k];
379 shw[ij_p] = rw[k];
380 for (int l = 0; l < LX; l++){
381 uttmp += shdz[k+l*(LX+1)] * ru[l];
382 vttmp += shdz[k+l*(LX+1)] * rv[l];
383 wttmp += shdz[k+l*(LX+1)] * rw[l];
384 }
386
387 T urtmp = 0.0;
388 T ustmp = 0.0;
389
390 T vrtmp = 0.0;
391 T vstmp = 0.0;
392
393 T wrtmp = 0.0;
394 T wstmp = 0.0;
395#pragma unroll
396 for (int l = 0; l < LX; l++){
397 urtmp += shdx[i+l*(LX+1)] * shu[l+j*(LX+1)];
398 ustmp += shdy[j+l*(LX+1)] * shu[i+l*(LX+1)];
399
400 vrtmp += shdx[i+l*(LX+1)] * shv[l+j*(LX+1)];
401 vstmp += shdy[j+l*(LX+1)] * shv[i+l*(LX+1)];
402
403 wrtmp += shdx[i+l*(LX+1)] * shw[l+j*(LX+1)];
404 wstmp += shdy[j+l*(LX+1)] * shw[i+l*(LX+1)];
405 }
406
407 T u1 = 0.0;
408 T u2 = 0.0;
409 T u3 = 0.0;
410 T v1 = 0.0;
411 T v2 = 0.0;
412 T v3 = 0.0;
413 T w1 = 0.0;
414 T w2 = 0.0;
415 T w3 = 0.0;
416
417 u1 = urtmp * drdx_local +
418 ustmp * dsdx_local +
420 u2 = urtmp * drdy_local +
421 ustmp * dsdy_local +
423 u3 = urtmp * drdz_local +
424 ustmp * dsdz_local +
426
427 v1 = vrtmp * drdx_local +
428 vstmp * dsdx_local +
430 v2 = vrtmp * drdy_local +
431 vstmp * dsdy_local +
433 v3 = vrtmp * drdz_local +
434 vstmp * dsdz_local +
436
437 w1 = wrtmp * drdx_local +
438 wstmp * dsdx_local +
440 w2 = wrtmp * drdy_local +
441 wstmp * dsdy_local +
443 w3 = wrtmp * drdz_local +
444 wstmp * dsdz_local +
446
447 T s11 = 0.0;
448 T s22 = 0.0;
449 T s33 = 0.0;
450 T s12 = 0.0;
451 T s13 = 0.0;
452 T s23 = 0.0;
453
454 s11 = dj*(u1 + u1);
455 s22 = dj*(v2 + v2);
456 s33 = dj*(w3 + w3);
457 s12 = dj*(u2 + v1);
458 s13 = dj*(u3 + w1);
459 s23 = dj*(v3 + w2);
460
461 shur[ij] = drdx_local * s11 +
462 drdy_local * s12 +
463 drdz_local * s13;
464 shus[ij_p] = dsdx_local * s11 +
465 dsdy_local * s12 +
466 dsdz_local * s13;
467 rut = dtdx_local * s11 +
468 dtdy_local * s12 +
469 dtdz_local * s13;
470
471 shvr[ij] = drdx_local * s12 +
472 drdy_local * s22 +
473 drdz_local * s23;
474 shvs[ij_p] = dsdx_local * s12 +
475 dsdy_local * s22 +
476 dsdz_local * s23;
477 rvt = dtdx_local * s12 +
478 dtdy_local * s22 +
479 dtdz_local * s23;
480
481 shwr[ij] = drdx_local * s13 +
482 drdy_local * s23 +
483 drdz_local * s33;
484 shws[ij_p] = dsdx_local * s13 +
485 dsdy_local * s23 +
486 dsdz_local * s33;
487 rwt = dtdx_local * s13 +
488 dtdy_local * s23 +
489 dtdz_local * s33;
490
492
493 T uwijke = 0.0;
494 T vwijke = 0.0;
495 T wwijke = 0.0;
496#pragma unroll
497 for (int l = 0; l < LX; l++){
498 uwijke += shur[l+j*LX] * shdx[l+i*(LX+1)];
499 ruw[l] += rut * shdz[k+l*(LX+1)];
500 uwijke += shus[i+l*(LX+1)] * shdy[l + j*(LX+1)];
501
502 vwijke += shvr[l+j*LX] * shdx[l+i*(LX+1)];
503 rvw[l] += rvt * shdz[k+l*(LX+1)];
504 vwijke += shvs[i+l*(LX+1)] * shdy[l + j*(LX+1)];
505
506 wwijke += shwr[l+j*LX] * shdx[l+i*(LX+1)];
507 rww[l] += rwt * shdz[k+l*(LX+1)];
508 wwijke += shws[i+l*(LX+1)] * shdy[l + j*(LX+1)];
509 }
510 ruw[k] += uwijke;
511 rvw[k] += vwijke;
512 rww[k] += wwijke;
513 }
514#pragma unroll
515 for (int k = 0; k < LX; ++k){
516 au[ij + k*LX*LX + ele] = ruw[k];
517 av[ij + k*LX*LX + ele] = rvw[k];
518 aw[ij + k*LX*LX + ele] = rww[k];
519 }
520}
521
522template< typename T >
524 T * __restrict__ av,
525 T * __restrict__ aw,
526 const T * __restrict__ u,
527 const T * __restrict__ v,
528 const T * __restrict__ w,
529 const T * __restrict__ h2,
530 const T * __restrict__ B,
531 const int n) {
532
533 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
534 const int str = blockDim.x * gridDim.x;
535
536 for (int i = idx; i < n; i += str) {
537 au[i] = au[i] + h2[i] * B[i] * u[i];
538 av[i] = av[i] + h2[i] * B[i] * v[i];
539 aw[i] = aw[i] + h2[i] * B[i] * w[i];
540 }
541
542}
543#endif // __MATH_AX_HELM_FULL_KERNEL_H__
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
T rv[LX]
__shared__ T shu[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ weight3
const int ij_p
__shared__ T shdy[LX *LX]
__global__ void __launch_bounds__(LX *LX, 3) ax_helm_stress_kernel_vector_kstep(T *__restrict__ au
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__shared__ T shw[LX *LX]
T rvw[LX]
__global__ void T *__restrict__ T *__restrict__ aw
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__shared__ T shdz[LX *LX]
T rww[LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__shared__ T shvs[LX *LX]
__shared__ T shws[LX *LX]
const int i
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
const int ij
shdx[ij]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
const int e
__global__ void T *__restrict__ av
T ruw[LX]
T ru[LX]
__global__ void ax_helm_stress_kernel_vector_part2(T *__restrict__ au, T *__restrict__ av, T *__restrict__ aw, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ h2, const T *__restrict__ B, const int n)
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__shared__ T shwr[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
const int ele
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
T rw[LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__shared__ T shus[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__syncthreads()
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
__shared__ T shv[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__shared__ T shvr[LX *LX]
__shared__ T shur[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)