Neko  0.9.0
A portable framework for high-order spectral element flow simulations
lambda2_kernel.h
Go to the documentation of this file.
1 #ifndef __MATH_LAMBDA2_KERNEL_H__
2 #define __MATH_LAMBDA2_KERNEL_H__
3 /*
4  Copyright (c) 2021-2023, The Neko Authors
5  All rights reserved.
6 
7  Redistribution and use in source and binary forms, with or without
8  modification, are permitted provided that the following conditions
9  are met:
10 
11  * Redistributions of source code must retain the above copyright
12  notice, this list of conditions and the following disclaimer.
13 
14  * Redistributions in binary form must reproduce the above
15  copyright notice, this list of conditions and the following
16  disclaimer in the documentation and/or other materials provided
17  with the distribution.
18 
19  * Neither the name of the authors nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  POSSIBILITY OF SUCH DAMAGE.
35 */
36 
template< typename T >
__inline__ __device__ T eigen_val_calc(T grad11, T grad12, T grad13,
                                       T grad21, T grad22, T grad23,
                                       T grad31, T grad32, T grad33) {
  /* Middle eigenvalue (lambda_2) of A = S^2 + O^2, where S and O are the
     symmetric and antisymmetric parts of the velocity gradient tensor
     grad{row}{col}.  A is symmetric, so its eigenvalues are real and are
     obtained with the trigonometric form of Cardano's formula.
     Returns 0 if the eigenvalues cannot be ordered (NaN input).
     All literals are cast through T so no double-precision arithmetic
     sneaks in when T = float. */

  /* Rate-of-strain tensor S = 0.5*(grad + grad^T) (upper triangle). */
  const T s11 = grad11;
  const T s22 = grad22;
  const T s33 = grad33;
  const T s12 = T(0.5) * (grad12 + grad21);
  const T s13 = T(0.5) * (grad13 + grad31);
  const T s23 = T(0.5) * (grad23 + grad32);

  /* Rotation tensor O = 0.5*(grad - grad^T) (upper triangle). */
  const T o12 = T(0.5) * (grad12 - grad21);
  const T o13 = T(0.5) * (grad13 - grad31);
  const T o23 = T(0.5) * (grad23 - grad32);

  /* A = S^2 + O^2 is symmetric; only the upper triangle is needed. */
  const T a11 = s11*s11 + s12*s12 + s13*s13 - o12*o12 - o13*o13;
  const T a12 = s11*s12 + s12*s22 + s13*s23 - o13*o23;
  const T a13 = s11*s13 + s12*s23 + s13*s33 + o12*o23;
  const T a22 = s12*s12 + s22*s22 + s23*s23 - o12*o12 - o23*o23;
  const T a23 = s12*s13 + s22*s23 + s23*s33 - o12*o13;
  const T a33 = s13*s13 + s23*s23 + s33*s33 - o13*o13 - o23*o23;

  /* Characteristic polynomial: lambda^3 + B*lambda^2 + C*lambda + D = 0. */
  const T B = -(a11 + a22 + a33);
  const T C = -(a12*a12 + a13*a13 + a23*a23
                - a11*a22 - a11*a33 - a22*a33);
  const T D = -(T(2.0)*a12*a13*a23 - a11*a23*a23 - a22*a13*a13
                - a33*a12*a12 + a11*a22*a33);

  /* Trigonometric solution: all three roots are real for a symmetric A,
     so analytically q <= 0 and |r/sqrt(-q^3)| <= 1. */
  const T q = (T(3.0)*C - B*B) / T(9.0);
  const T r = (T(9.0)*C*B - T(27.0)*D - T(2.0)*B*B*B) / T(54.0);

  /* Clamp the cosine argument: rounding can push it marginally outside
     [-1, 1], which would make acos() return NaN for a valid matrix.
     (If q ends up non-negative numerically, cos_arg is NaN, both
     comparisons are false, and the NaN propagates to the 0 fallback
     below, exactly as in the unclamped version.) */
  T cos_arg = r / sqrt(-q*q*q);
  if (cos_arg < T(-1.0)) cos_arg = T(-1.0);
  if (cos_arg > T(1.0))  cos_arg = T(1.0);

  const T theta = acos(cos_arg);
  const T pi = T(4.0) * atan(T(1.0));
  const T eigen1 = T(2.0) * sqrt(-q) * cos(theta / T(3.0)) - B / T(3.0);
  const T eigen2 = T(2.0) * sqrt(-q) * cos((theta + T(2.0) * pi) / T(3.0))
                   - B / T(3.0);
  const T eigen3 = T(2.0) * sqrt(-q) * cos((theta + T(4.0) * pi) / T(3.0))
                   - B / T(3.0);

  /* Return the middle of the three eigenvalues; every ordering is
     enumerated so the final return is reached only when a comparison
     failed (NaN), in which case 0 is returned. */
  if (eigen1 <= eigen2 && eigen2 <= eigen3)
    return eigen2;
  else if (eigen3 <= eigen2 && eigen2 <= eigen1)
    return eigen2;
  else if (eigen1 <= eigen3 && eigen3 <= eigen2)
    return eigen3;
  else if (eigen2 <= eigen3 && eigen3 <= eigen1)
    return eigen3;
  else if (eigen2 <= eigen1 && eigen1 <= eigen3)
    return eigen1;
  else if (eigen3 <= eigen1 && eigen1 <= eigen2)
    return eigen1;
  return T(0.0);
}
95 
96 
/**
 * Compute lambda2 at every GLL point of every element (1D thread layout).
 *
 * Launch contract (assumed, matching the load loops below — confirm at
 * the call site): one block per element (gridDim.x = number of elements)
 * and blockDim.x == CHUNKS with CHUNKS >= LX*LX, so that the staging
 * loops cover all LX^3 points and the LX*LX derivative matrices.
 *
 * dx, dy, dz        : LX*LX 1D derivative matrices (shared across elements)
 * drdx .. dtdz      : metric terms per GLL point, per element
 * jacinv            : inverse Jacobian per GLL point, per element
 * u, v, w           : velocity components per GLL point, per element
 * lambda2 (output)  : middle eigenvalue of S^2 + O^2 per GLL point
 */
template< typename T, const int LX, const int CHUNKS >
__global__ void lambda2_kernel_1d(T * __restrict__ lambda2,
                                  const T * __restrict__ u,
                                  const T * __restrict__ v,
                                  const T * __restrict__ w,
                                  const T * __restrict__ dx,
                                  const T * __restrict__ dy,
                                  const T * __restrict__ dz,
                                  const T * __restrict__ drdx,
                                  const T * __restrict__ dsdx,
                                  const T * __restrict__ dtdx,
                                  const T * __restrict__ drdy,
                                  const T * __restrict__ dsdy,
                                  const T * __restrict__ dtdy,
                                  const T * __restrict__ drdz,
                                  const T * __restrict__ dsdz,
                                  const T * __restrict__ dtdz,
                                  const T * __restrict__ jacinv) {

  /* This element's velocity field staged in shared memory. */
  __shared__ T shu[LX * LX * LX];
  __shared__ T shv[LX * LX * LX];
  __shared__ T shw[LX * LX * LX];

  /* 1D derivative matrices staged in shared memory. */
  __shared__ T shdx[LX * LX];
  __shared__ T shdy[LX * LX];
  __shared__ T shdz[LX * LX];

  int i, j, k;

  const int ele = blockIdx.x * LX * LX * LX;  /* first dof of this element */
  const int iii = threadIdx.x;
  const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;  /* ceil(LX^3/CHUNKS) */

  /* One thread per derivative-matrix entry (needs blockDim.x >= LX*LX). */
  if (iii < (LX * LX)) {
    shdx[iii] = dx[iii];
    shdy[iii] = dy[iii];
    shdz[iii] = dz[iii];
  }

  /* Stage u, v, w for this element, striding by the block size. */
  j = iii;
  while (j < (LX * LX * LX)) {
    shu[j] = u[j + ele];
    shv[j] = v[j + ele];
    shw[j] = w[j + ele];
    j = j + CHUNKS;
  }

  /* All shared-memory staging must finish before any cross-thread read. */
  __syncthreads();

  for (int n = 0; n < nchunks; n++) {
    const int ijk = iii + n * CHUNKS;   /* linear GLL index in the element */
    const int jk = ijk / LX;
    i = ijk - jk * LX;
    k = jk / LX;
    j = jk - k * LX;
    if (i < LX && j < LX && k < LX) {   /* guard the tail chunk */
      /* Reference-space derivatives d/dr, d/ds, d/dt of u, v, w. */
      T rtmpu = 0.0;
      T stmpu = 0.0;
      T ttmpu = 0.0;

      T rtmpv = 0.0;
      T stmpv = 0.0;
      T ttmpv = 0.0;

      T rtmpw = 0.0;
      T stmpw = 0.0;
      T ttmpw = 0.0;
      for (int l = 0; l < LX; l++) {
        rtmpu += shdx[i + l * LX] * shu[l + j * LX + k * LX * LX];
        stmpu += shdy[j + l * LX] * shu[i + l * LX + k * LX * LX];
        ttmpu += shdz[k + l * LX] * shu[i + j * LX + l * LX * LX];

        rtmpv += shdx[i + l * LX] * shv[l + j * LX + k * LX * LX];
        stmpv += shdy[j + l * LX] * shv[i + l * LX + k * LX * LX];
        ttmpv += shdz[k + l * LX] * shv[i + j * LX + l * LX * LX];

        rtmpw += shdx[i + l * LX] * shw[l + j * LX + k * LX * LX];
        stmpw += shdy[j + l * LX] * shw[i + l * LX + k * LX * LX];
        ttmpw += shdz[k + l * LX] * shw[i + j * LX + l * LX * LX];
      }

      const T jinv = jacinv[ijk + ele];

      /* Chain rule: physical-space gradient grad{row}{col} =
         jinv * (dr/dx_col * d/dr + ds/dx_col * d/ds + dt/dx_col * d/dt). */
      T grad11 = jinv
               * (drdx[ijk + ele] * rtmpu
                + dsdx[ijk + ele] * stmpu
                + dtdx[ijk + ele] * ttmpu);

      T grad12 = jinv
               * (drdy[ijk + ele] * rtmpu
                + dsdy[ijk + ele] * stmpu
                + dtdy[ijk + ele] * ttmpu);

      T grad13 = jinv
               * (drdz[ijk + ele] * rtmpu
                + dsdz[ijk + ele] * stmpu
                + dtdz[ijk + ele] * ttmpu);

      T grad21 = jinv
               * (drdx[ijk + ele] * rtmpv
                + dsdx[ijk + ele] * stmpv
                + dtdx[ijk + ele] * ttmpv);

      T grad22 = jinv
               * (drdy[ijk + ele] * rtmpv
                + dsdy[ijk + ele] * stmpv
                + dtdy[ijk + ele] * ttmpv);

      T grad23 = jinv
               * (drdz[ijk + ele] * rtmpv
                + dsdz[ijk + ele] * stmpv
                + dtdz[ijk + ele] * ttmpv);

      T grad31 = jinv
               * (drdx[ijk + ele] * rtmpw
                + dsdx[ijk + ele] * stmpw
                + dtdx[ijk + ele] * ttmpw);

      T grad32 = jinv
               * (drdy[ijk + ele] * rtmpw
                + dsdy[ijk + ele] * stmpw
                + dtdy[ijk + ele] * ttmpw);

      T grad33 = jinv
               * (drdz[ijk + ele] * rtmpw
                + dsdz[ijk + ele] * stmpw
                + dtdz[ijk + ele] * ttmpw);

      /* Same offset as every other per-point array: ele == e*LX*LX*LX. */
      lambda2[ijk + ele] = eigen_val_calc<T>(grad11, grad12, grad13,
                                             grad21, grad22, grad23,
                                             grad31, grad32, grad33);
    }
  }
}
237 
/**
 * Compute lambda2 at every GLL point, marching slab-by-slab in k (z).
 *
 * Launch contract (assumed from the indexing below — confirm at the call
 * site): one block per element (gridDim.x = number of elements) with a 2D
 * block of blockDim = (LX, LX); each thread owns one (i,j) column and
 * keeps its full k-line of u, v, w in registers.  __launch_bounds__ caps
 * the block at LX*LX threads and requests >= 3 resident blocks per SM.
 */
template< typename T, const int LX >
__global__ void __launch_bounds__(LX*LX,3)
lambda2_kernel_kstep(T * __restrict__ lambda2,
                     const T * __restrict__ u,
                     const T * __restrict__ v,
                     const T * __restrict__ w,
                     const T * __restrict__ dx,
                     const T * __restrict__ dy,
                     const T * __restrict__ dz,
                     const T * __restrict__ drdx,
                     const T * __restrict__ dsdx,
                     const T * __restrict__ dtdx,
                     const T * __restrict__ drdy,
                     const T * __restrict__ dsdy,
                     const T * __restrict__ dtdy,
                     const T * __restrict__ drdz,
                     const T * __restrict__ dsdz,
                     const T * __restrict__ dtdz,
                     const T * __restrict__ jacinv) {

  /* One x-y slab of each velocity component (reloaded every k step). */
  __shared__ T shu[LX * LX];
  __shared__ T shv[LX * LX];
  __shared__ T shw[LX * LX];

  /* 1D derivative matrices, loaded once. */
  __shared__ T shdx[LX * LX];
  __shared__ T shdy[LX * LX];
  __shared__ T shdz[LX * LX];

  const int e = blockIdx.x;
  const int j = threadIdx.y;
  const int i = threadIdx.x;
  const int ij = i + j * LX;
  const int ele = e*LX*LX*LX;     /* first dof of this element */

  /* One derivative-matrix entry per thread (blockDim == LX*LX). */
  shdx[ij] = dx[ij];
  shdy[ij] = dy[ij];
  shdz[ij] = dz[ij];

  /* This thread's k-line of the velocity field, held in registers. */
  T ru[LX];
  T rv[LX];
  T rw[LX];

#pragma unroll LX
  for (int k = 0; k < LX; ++k) {
    ru[k] = u[ij + k*LX*LX + ele];
    rv[k] = v[ij + k*LX*LX + ele];
    rw[k] = w[ij + k*LX*LX + ele];
  }

  /* Barrier: shdz[k+l*LX] below is read from entries written by OTHER
     threads, so the derivative-matrix staging must be complete before
     the first k iteration.  Without this, the first slab races. */
  __syncthreads();

#pragma unroll
  for (int k = 0; k < LX; ++k) {
    const int ijk = ij + k*LX*LX;
    const T jinv = jacinv[ijk+ele];
    T ttmpu = 0.0;
    T ttmpv = 0.0;
    T ttmpw = 0.0;
    /* Publish this slab; only read after the barrier below. */
    shu[ij] = ru[k];
    shv[ij] = rv[k];
    shw[ij] = rw[k];
    /* d/dt uses only this thread's registers — no barrier needed yet. */
    for (int l = 0; l < LX; l++) {
      ttmpu += shdz[k+l*LX] * ru[l];
      ttmpv += shdz[k+l*LX] * rv[l];
      ttmpw += shdz[k+l*LX] * rw[l];
    }
    /* Barrier: the slab written above is read cross-thread below. */
    __syncthreads();

    T rtmpu = 0.0;
    T stmpu = 0.0;
    T rtmpv = 0.0;
    T stmpv = 0.0;
    T rtmpw = 0.0;
    T stmpw = 0.0;
#pragma unroll
    for (int l = 0; l < LX; l++) {
      rtmpu += shdx[i+l*LX] * shu[l+j*LX];
      stmpu += shdy[j+l*LX] * shu[i+l*LX];
      rtmpv += shdx[i+l*LX] * shv[l+j*LX];
      stmpv += shdy[j+l*LX] * shv[i+l*LX];
      rtmpw += shdx[i+l*LX] * shw[l+j*LX];
      stmpw += shdy[j+l*LX] * shw[i+l*LX];
    }

    /* Chain rule to physical space: grad{row}{col}. */
    T grad11 = jinv * (drdx[ijk + ele] * rtmpu
                     + dsdx[ijk + ele] * stmpu
                     + dtdx[ijk + ele] * ttmpu);

    T grad12 = jinv * (drdy[ijk + ele] * rtmpu
                     + dsdy[ijk + ele] * stmpu
                     + dtdy[ijk + ele] * ttmpu);

    T grad13 = jinv * (drdz[ijk + ele] * rtmpu
                     + dsdz[ijk + ele] * stmpu
                     + dtdz[ijk + ele] * ttmpu);

    T grad21 = jinv * (drdx[ijk + ele] * rtmpv
                     + dsdx[ijk + ele] * stmpv
                     + dtdx[ijk + ele] * ttmpv);

    T grad22 = jinv * (drdy[ijk + ele] * rtmpv
                     + dsdy[ijk + ele] * stmpv
                     + dtdy[ijk + ele] * ttmpv);

    T grad23 = jinv * (drdz[ijk + ele] * rtmpv
                     + dsdz[ijk + ele] * stmpv
                     + dtdz[ijk + ele] * ttmpv);

    T grad31 = jinv * (drdx[ijk + ele] * rtmpw
                     + dsdx[ijk + ele] * stmpw
                     + dtdx[ijk + ele] * ttmpw);

    T grad32 = jinv * (drdy[ijk + ele] * rtmpw
                     + dsdy[ijk + ele] * stmpw
                     + dtdy[ijk + ele] * ttmpw);

    T grad33 = jinv * (drdz[ijk + ele] * rtmpw
                     + dsdz[ijk + ele] * stmpw
                     + dtdz[ijk + ele] * ttmpw);

    lambda2[ijk + ele] = eigen_val_calc<T>(grad11, grad12, grad13,
                                           grad21, grad22, grad23,
                                           grad31, grad32, grad33);
    /* Barrier: the slab must be fully consumed before the next
       iteration overwrites shu/shv/shw. */
    __syncthreads();
  }
}
361 
362 
363 #endif // __MATH_LAMBDA2_KERNEL_H__
__shared__ T shu[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__global__ void lambda2_kernel_1d(T *__restrict__ lambda2, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ drdx, const T *__restrict__ dsdx, const T *__restrict__ dtdx, const T *__restrict__ drdy, const T *__restrict__ dsdy, const T *__restrict__ dtdy, const T *__restrict__ drdz, const T *__restrict__ dsdz, const T *__restrict__ dtdz, const T *__restrict__ jacinv)
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
T ru[LX]
T rv[LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__shared__ T shdz[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
const int i
const int ij
__global__ void __launch_bounds__(LX *LX, 3) lambda2_kernel_kstep(T *__restrict__ lambda2
__shared__ T shdy[LX *LX]
T rw[LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__shared__ T shw[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
const int e
__shared__ T shv[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__inline__ __device__ T eigen_val_calc(T grad11, T grad12, T grad13, T grad21, T grad22, T grad23, T grad31, T grad32, T grad33)
__shared__ T shdx[LX *LX]
const int ele
__global__ void const T *__restrict__ u
const int j
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__syncthreads()
__global__ void const T *__restrict__ const T *__restrict__ v
A simulation component that computes lambda2 The values are stored in the field registry under the na...
Definition: lambda2.f90:37
real(kind=rp), parameter, public pi
Definition: math.f90:76