Neko 0.9.99
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
lambda2_kernel.h
Go to the documentation of this file.
1#ifndef __MATH_LAMBDA2_KERNEL_H__
2#define __MATH_LAMBDA2_KERNEL_H__
3/*
4 Copyright (c) 2021-2023, The Neko Authors
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following
16 disclaimer in the documentation and/or other materials provided
17 with the distribution.
18
19 * Neither the name of the authors nor the names of its
20 contributors may be used to endorse or promote products derived
21 from this software without specific prior written permission.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35*/
36
42template< typename T>
45 T grad31, T grad32, T grad33 ) {
46 T s11 = grad11;
47 T s22 = grad22;
48 T s33 = grad33;
49 T s12 = 0.5*(grad12+grad21);
50 T s13 = 0.5*(grad13+grad31);
51 T s23 = 0.5*(grad23+grad32);
52
53 T o12 = 0.5*(grad12-grad21);
54 T o13 = 0.5*(grad13-grad31);
55 T o23 = 0.5*(grad23-grad32);
56
57 T a11 = s11*s11 + s12*s12 + s13*s13 - o12*o12 - o13*o13;
58 T a12 = s11 * s12 + s12 * s22 + s13 * s23 - o13 * o23;
59 T a13 = s11 * s13 + s12 * s23 + s13 * s33 + o12 * o23;
60
61 T a22 = s12*s12 + s22*s22 + s23*s23 - o12*o12 - o23*o23;
62 T a23 = s12 * s13 + s22 * s23 + s23 * s33 - o12 * o13;
63 T a33 = s13*s13 + s23*s23 + s33*s33 - o13*o13 - o23*o23;
64
65
66 T B = -(a11 + a22 + a33);
67 T C = -(a12*a12 + a13*a13 + a23*a23 - a11 * a22 - a11 * a33 - a22 * a33);
68 T D = -(2.0 * a12 * a13 * a23 - a11 * a23*a23 - a22 * a13*a13
69 - a33 * a12*a12 + a11 * a22 * a33);
70
71
72 T q = (3.0 * C - B*B) / 9.0;
73 T r = (9.0 * C * B - 27.0 * D - 2.0 * B*B*B) / 54.0;
74
75 T theta = acos( r / sqrt(-q*q*q) );
76 T pi = 4.0*atan(1.0);
77 T eigen1 = 2.0 * sqrt(-q) * cos(theta / 3.0) - B / 3.0;
78 T eigen2 = 2.0 * sqrt(-q) * cos((theta + 2.0 * pi) / 3.0) - B / 3.0;
79 T eigen3 = 2.0 * sqrt(-q) * cos((theta + 4.0 * pi) / 3.0) - B / 3.0;
80
81 if (eigen1 <= eigen2 && eigen2 <= eigen3)
82 return eigen2;
83 else if (eigen3 <= eigen2 && eigen2 <= eigen1)
84 return eigen2;
85 else if (eigen1 <= eigen3 && eigen3 <= eigen2)
86 return eigen3;
87 else if (eigen2 <= eigen3 && eigen3 <= eigen1)
88 return eigen3;
89 else if (eigen2 <= eigen1 && eigen1 <= eigen3)
90 return eigen1;
91 else if (eigen3 <= eigen1 && eigen1 <= eigen2)
92 return eigen1;
93 return 0.0;
94}
95
96
97template< typename T, const int LX, const int CHUNKS >
99 const T * __restrict__ u,
100 const T * __restrict__ v,
101 const T * __restrict__ w,
102 const T * __restrict__ dx,
103 const T * __restrict__ dy,
104 const T * __restrict__ dz,
105 const T * __restrict__ drdx,
106 const T * __restrict__ dsdx,
107 const T * __restrict__ dtdx,
108 const T * __restrict__ drdy,
109 const T * __restrict__ dsdy,
110 const T * __restrict__ dtdy,
111 const T * __restrict__ drdz,
112 const T * __restrict__ dsdz,
113 const T * __restrict__ dtdz,
114 const T * __restrict__ jacinv) {
115
116 __shared__ T shu[LX * LX * LX];
117 __shared__ T shv[LX * LX * LX];
118 __shared__ T shw[LX * LX * LX];
119
120 __shared__ T shdx[LX * LX];
121 __shared__ T shdy[LX * LX];
122 __shared__ T shdz[LX * LX];
123
124
125
126
127 int i,j,k;
128
129 const int e = blockIdx.x;
130 const int ele = blockIdx.x*LX*LX*LX;
131 const int iii = threadIdx.x;
132 const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
133
134
135 if (iii < (LX * LX)) {
136 shdx[iii] = dx[iii];
137 shdy[iii] = dy[iii];
138 shdz[iii] = dz[iii];
139 }
140
141 j = iii;
142 while(j < (LX * LX * LX)) {
143 shu[j] = u[j + ele];
144 shv[j] = v[j + ele];
145 shw[j] = w[j + ele];
146 j = j + CHUNKS;
147 }
148
150
151 for (int n = 0; n < nchunks; n++) {
152 const int ijk = iii + n * CHUNKS;
153 const int jk = ijk / LX;
154 i = ijk - jk * LX;
155 k = jk / LX;
156 j = jk - k * LX;
157 if ( i < LX && j < LX && k < LX ) {
158 T rtmpu = 0.0;
159 T stmpu = 0.0;
160 T ttmpu = 0.0;
161
162 T rtmpv = 0.0;
163 T stmpv = 0.0;
164 T ttmpv = 0.0;
165
166 T rtmpw = 0.0;
167 T stmpw = 0.0;
168 T ttmpw = 0.0;
169 for (int l = 0; l < LX; l++) {
170 rtmpu += shdx[i + l * LX] * shu[l + j * LX + k * LX * LX];
171 stmpu += shdy[j + l * LX] * shu[i + l * LX + k * LX * LX];
172 ttmpu += shdz[k + l * LX] * shu[i + j * LX + l * LX * LX];
173
174 rtmpv += shdx[i + l * LX] * shv[l + j * LX + k * LX * LX];
175 stmpv += shdy[j + l * LX] * shv[i + l * LX + k * LX * LX];
176 ttmpv += shdz[k + l * LX] * shv[i + j * LX + l * LX * LX];
177
178 rtmpw += shdx[i + l * LX] * shw[l + j * LX + k * LX * LX];
179 stmpw += shdy[j + l * LX] * shw[i + l * LX + k * LX * LX];
180 ttmpw += shdz[k + l * LX] * shw[i + j * LX + l * LX * LX];
181 }
182
183 T jinv = jacinv[ijk + ele];
184
185 T grad11 = jinv
186 * (drdx[ijk + ele] * rtmpu
187 + dsdx[ijk + ele] * stmpu
188 + dtdx[ijk + ele] * ttmpu);
189
190 T grad12 = jinv
191 * (drdy[ijk + ele] * rtmpu
192 + dsdy[ijk + ele] * stmpu
193 + dtdy[ijk + ele] * ttmpu);
194
195 T grad13 = jinv
196 * (drdz[ijk + ele] * rtmpu
197 + dsdz[ijk + ele] * stmpu
198 + dtdz[ijk + ele] * ttmpu);
199
200 T grad21 = jinv
201 * (drdx[ijk + ele] * rtmpv
202 + dsdx[ijk + ele] * stmpv
203 + dtdx[ijk + ele] * ttmpv);
204
205 T grad22 = jinv
206 * (drdy[ijk + ele] * rtmpv
207 + dsdy[ijk + ele] * stmpv
208 + dtdy[ijk + ele] * ttmpv);
209
210 T grad23 = jinv
211 * (drdz[ijk + ele] * rtmpv
212 + dsdz[ijk + ele] * stmpv
213 + dtdz[ijk + ele] * ttmpv);
214
215
216 T grad31 = jinv
217 * (drdx[ijk + ele] * rtmpw
218 + dsdx[ijk + ele] * stmpw
219 + dtdx[ijk + ele] * ttmpw);
220
221 T grad32 = jinv
222 * (drdy[ijk + ele] * rtmpw
223 + dsdy[ijk + ele] * stmpw
224 + dtdy[ijk + ele] * ttmpw);
225
226 T grad33 = jinv
227 * (drdz[ijk + ele] * rtmpw
228 + dsdz[ijk + ele] * stmpw
229 + dtdz[ijk + ele] * ttmpw);
233 }
234 }
235
236}
237
238template< typename T, const int LX >
257
258 __shared__ T shu[LX * LX];
261
265
266 const int e = blockIdx.x;
267 const int j = threadIdx.y;
268 const int i = threadIdx.x;
269 const int ij = i + j * LX;
270 const int ele = e*LX*LX*LX;
271
272 shdx[ij] = dx[ij];
273 shdy[ij] = dy[ij];
274 shdz[ij] = dz[ij];
275
279
280#pragma unroll LX
281 for (int k = 0; k < LX; ++k) {
282 ru[k] = u[ij + k*LX*LX + ele];
283 rv[k] = v[ij + k*LX*LX + ele];
284 rw[k] = w[ij + k*LX*LX + ele];
285 }
286
288
289 #pragma unroll
290 for (int k = 0; k < LX; ++k) {
291 const int ijk = ij + k*LX*LX;
292 const T jinv = jacinv[ijk+ele];
293 T ttmpu = 0.0;
294 T ttmpv = 0.0;
295 T ttmpw = 0.0;
296 shu[ij] = ru[k];
297 shv[ij] = rv[k];
298 shw[ij] = rw[k];
299 for (int l = 0; l < LX; l++) {
300 ttmpu += shdz[k+l*LX] * ru[l];
301 ttmpv += shdz[k+l*LX] * rv[l];
302 ttmpw += shdz[k+l*LX] * rw[l];
303 }
305
306 T rtmpu = 0.0;
307 T stmpu = 0.0;
308 T rtmpv = 0.0;
309 T stmpv = 0.0;
310 T rtmpw = 0.0;
311 T stmpw = 0.0;
312#pragma unroll
313 for (int l = 0; l < LX; l++) {
314 rtmpu += shdx[i+l*LX] * shu[l+j*LX];
315 stmpu += shdy[j+l*LX] * shu[i+l*LX];
316 rtmpv += shdx[i+l*LX] * shv[l+j*LX];
317 stmpv += shdy[j+l*LX] * shv[i+l*LX];
318 rtmpw += shdx[i+l*LX] * shw[l+j*LX];
319 stmpw += shdy[j+l*LX] * shw[i+l*LX];
320 }
321
322 T grad11 = jinv * (drdx[ijk + ele] * rtmpu
323 + dsdx[ijk + ele] * stmpu
324 + dtdx[ijk + ele] * ttmpu);
325
326 T grad12 = jinv * (drdy[ijk + ele] * rtmpu
327 + dsdy[ijk + ele] * stmpu
328 + dtdy[ijk + ele] * ttmpu);
329
330 T grad13 = jinv * (drdz[ijk + ele] * rtmpu
331 + dsdz[ijk + ele] * stmpu
332 + dtdz[ijk + ele] * ttmpu);
333 T grad21 = jinv * (drdx[ijk + ele] * rtmpv
334 + dsdx[ijk + ele] * stmpv
335 + dtdx[ijk + ele] * ttmpv);
336
337 T grad22 = jinv * (drdy[ijk + ele] * rtmpv
338 + dsdy[ijk + ele] * stmpv
339 + dtdy[ijk + ele] * ttmpv);
340
341 T grad23 = jinv * (drdz[ijk + ele] * rtmpv
342 + dsdz[ijk + ele] * stmpv
343 + dtdz[ijk + ele] * ttmpv);
344 T grad31 = jinv * (drdx[ijk + ele] * rtmpw
345 + dsdx[ijk + ele] * stmpw
346 + dtdx[ijk + ele] * ttmpw);
347
348 T grad32 = jinv * (drdy[ijk + ele] * rtmpw
349 + dsdy[ijk + ele] * stmpw
350 + dtdy[ijk + ele] * ttmpw);
351
352 T grad33 = jinv * (drdz[ijk + ele] * rtmpw
353 + dsdz[ijk + ele] * stmpw
354 + dtdz[ijk + ele] * ttmpw);
359 }
360}
361
362
363#endif // __MATH_LAMBDA2_KERNEL_H__
__shared__ T shu[LX *LX]
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__global__ void lambda2_kernel_1d(T *__restrict__ lambda2, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ drdx, const T *__restrict__ dsdx, const T *__restrict__ dtdx, const T *__restrict__ drdy, const T *__restrict__ dsdy, const T *__restrict__ dtdy, const T *__restrict__ drdz, const T *__restrict__ dsdz, const T *__restrict__ dtdz, const T *__restrict__ jacinv)
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
T ru[LX]
T rv[LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__shared__ T shdz[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
const int i
const int ij
__global__ void __launch_bounds__(LX *LX, 3) lambda2_kernel_kstep(T *__restrict__ lambda2
__shared__ T shdy[LX *LX]
T rw[LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__shared__ T shw[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
const int e
__shared__ T shv[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__inline__ __device__ T eigen_val_calc(T grad11, T grad12, T grad13, T grad21, T grad22, T grad23, T grad31, T grad32, T grad33)
__shared__ T shdx[LX *LX]
const int ele
__global__ void const T *__restrict__ u
const int j
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__syncthreads()
__global__ void const T *__restrict__ const T *__restrict__ v
A simulation component that computes lambda2 The values are stored in the field registry under the na...
Definition lambda2.f90:37