Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
most_kernel.h
Go to the documentation of this file.
1/*
2 Copyright (c) 2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifndef MOST_KERNEL_H
36#define MOST_KERNEL_H
37
38#include <cmath>
39#include <algorithm>
40
41/*
42* Similarity laws and corrections for the STABLE regime:
43* REFERENCE: Cheng, Y., and W. Brutsaert (2005), Flux-profile relationships
44* for wind speed and temperature in the stable atmospheric boundary layer,
45* Bound.-Layer Meteorol., 3, 519-538.
46* NOTE: This formulation is chosen for its superior behavior in very stable
47* conditions (large z/L), avoiding the numerical decoupling found in older linear (e.g., Webb or Holtslag) functions.
48*/
49
50template<typename T>
52{
53 // Coefficients specific to Cheng & Brutsaert (2005)
54 T a = 1.0;
55 T b = 2.0/3.0;
56 T c = 5.0;
57 T d = 0.35;
58 T zeta = z / L_ob;
59
60 return -a*zeta - b*(zeta-c/d)*exp(-d*zeta) - b*c/d;
61}
62
63template<typename T>
65{
66 // Coefficients specific to Cheng & Brutsaert (2005)
67 T a = 1.0;
68 T b = 2.0/3.0;
69 T c = 5.0;
70 T d = 0.35;
71 T zeta = z / L_ob;
72
73 return -b*(zeta-c/d)*exp(-d*zeta)
74 -pow(1.0 + 2.0/3.0*a*zeta, 1.5)
75 -b*c/d + 1.0;
76}
77
78template<typename T>
80{
81 return log(z/z0)
84}
85
86template<typename T>
88{
89 return log(z/z0h)
92}
93
94template<typename T>
96{
97 return Ri_b - Pr*z/L_ob / (
101 );
102}
103
104template<typename T>
106 T l_lower,
107 T z,
108 T z0,
109 T z0h,
110 T fd_h,
111 T Pr)
112{
113 T up = -z/l_upper /
114 (
118 );
119
120 T low = z/l_lower /
121 (
125 );
126
127 return Pr*(up + low) / (2*fd_h);
128}
129
130template<typename T>
132{
133 return Ri_b - Pr*z/L_ob * slaw_h_stable<T>(z,L_ob,z0h) / (
135 );
136}
137
138template<typename T>
140 T l_lower,
141 T z,
142 T z0,
143 T z0h,
144 T fd_h,
145 T Pr)
146{
147 T up = -z/l_upper *
150 );
151
152 T low = z/l_lower *
155 );
156
157 return Pr*(up + low) / (2*fd_h);
158}
159
160/*
161* Similarity laws and corrections for the UNSTABLE (convective) regime:
162* REFERENCE: Dyer, A. J. (1974), A review of flux-profile relationships,
163* Bound.-Layer Meteorol., 7, 363-372.
164* INTEGRATION: Paulson, C. A. (1970), The mathematical representation
165* of wind speed and temperature profiles in the unstable atmospheric
166* surface layer, J. Appl. Meteorol., 9, 857-861.
167*/
168
169template<typename T>
171{
172 T zeta = z / L_ob;
173 T pi = 4*atan(1.0);
174 // Standard Dyer-Businger coefficient gamma = 16.0
175 T xi = sqrt(sqrt((1.0 - 16.0*zeta)));
176
177 return 2*log(0.5*(1 + xi)) + log(0.5*(1 + xi*xi)) - 2*atan(xi) + pi/2;
178}
179
180template<typename T>
182{
183 T zeta = z/L_ob;
184 T pi = 4*atan(1.0);
185 // Standard Dyer-Businger coefficient gamma = 16.0
186 T xi = sqrt(sqrt((1.0 - 16.0*zeta)));
187 return 2*log(0.5*(1 + xi*xi));
188}
189
190template<typename T>
192{
193 return log(z/z0)
196}
197
198template<typename T>
200{
201 return log(z/z0h)
204}
205
206template<typename T>
208{
209 return Ri_b - Pr*z/L_ob / (
213 );
214}
215
216template<typename T>
218 T l_lower,
219 T z,
220 T z0,
221 T z0h,
222 T fd_h,
223 T Pr)
224{
225 T up = -z/l_upper /
226 (
230 );
231
232 T low = z/l_lower /
233 (
237 );
238
239 return Pr*(up + low) / (2*fd_h);
240}
241
242template<typename T>
249
250template<typename T>
252 T l_lower,
253 T z,
254 T z0,
255 T z0h,
256 T fd_h,
257 T Pr)
258{
259 T up = -z/l_upper *
262 );
263
264 T low = z/l_lower *
267 );
268
269 return Pr*(up + low) / (2*fd_h);
270}
271
272/*
273* Similarity laws and corrections for the NEUTRAL regime:
274*/
275
276template<typename T>
278{
279 return log(z/z0);
280}
281
282template<typename T>
284{
285 return log(z/z0h);
286}
287
288/*
289 * CUDA kernel for the MOST wall model.
290 */
291template<typename T, int BC_TYPE>
293 const T* __restrict__ u_d,
294 const T* __restrict__ v_d,
295 const T* __restrict__ w_d,
296 const T* __restrict__ temp_d,
297 const T* __restrict__ h_d,
298 const T* __restrict__ n_x_d,
299 const T* __restrict__ n_y_d,
300 const T* __restrict__ n_z_d,
301 const int* __restrict__ ind_r_d,
302 const int* __restrict__ ind_s_d,
303 const int* __restrict__ ind_t_d,
304 const int* __restrict__ ind_e_d,
308 int n_nodes,
309 int lx,
310 T kappa,
311 const T * __restrict__ mu_w_d,
312 const T * __restrict__ rho_w_d,
313 T g1,
314 T g2,
315 T g3,
316 T Pr,
317 T z0,
318 T z0h_in,
319 T bc_value,
327 const int* __restrict__ h_x_idx,
328 const int* __restrict__ h_y_idx,
329 const int* __restrict__ h_z_idx
330) {
331 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
332 const int str = blockDim.x * gridDim.x;
333 if(idx >= n_nodes) return;
334
335 const T Ri_threshold = 1e-4;
336 const T tol = 1e-3;
337 const T NR_step = 1e-3;
338 const int max_iter = 50;
339
340 // Calculate the global 1D offset in the field arrays
341 // Layout: e is the slowest, r is the fastest
342 // Mapping GLL points from the full field to the boundary thread
343 // Neko provides 1-based indices from Fortran, so subtract 1.
344 for (int i = idx; i < n_nodes; i += str) {
345 const int index = (ind_e_d[i] - 1) * lx * lx * lx +
346 (ind_t_d[i] - 1) * lx * lx +
347 (ind_s_d[i] - 1) * lx +
348 (ind_r_d[i] - 1); // 'long long' might be needed to avoid
349 // overflowing 32-bits of 'int'
350 T ui = u_d[index];
351 T vi = v_d[index];
352 T wi = w_d[index];
353 T ti = temp_d[index];
354 T hi = h_d[i];
355 T mu = mu_w_d[i];
356 T rho = rho_w_d[i];
357
358 // Extract the local normal vector
359 T nx = n_x_d[i];
360 T ny = n_y_d[i];
361 T nz = n_z_d[i];
362
363 // Get the tangential component
364 T normu = ui * nx + vi * ny + wi * nz;
365 ui -= normu * nx;
366 vi -= normu * ny;
367 wi -= normu * nz;
368
369 // The magnitude used for MOST is the magnitude of the tangential vector
370 T magu = sqrt(ui*ui + vi*vi + wi*wi);
371 magu = fmax(magu, (T)1e-6); // Avoid division by zero in extreme cases
372
373 // Initial utau estimate
374 T utau = kappa * magu / log(hi/z0);
375
376 // Zilitinkevich 1995 correlation for thermal roughness
377 T z0h;
378 if (z0h_in < 0.0) {
379 z0h = z0 * exp(z0h_in * sqrt((utau*z0)/(mu/rho)));
380 } else {
381 z0h = z0h_in;
382 }
383
384 T ts = 0;
385 T q = 0;
386
387 if constexpr (BC_TYPE == 0) // Neumann
388 q = bc_value;
389 else // Dirichlet
390 {
391 ts = bc_value;
392 q = kappa/Pr*utau*(ts-ti)/log(hi/z0h);
393 }
394
395 T Ri_b;
396 T g_dot_n = fabs(g1*nx + g2*ny + g3*nz);
397 if constexpr (BC_TYPE == 0)
398 Ri_b = -g_dot_n*hi/ti*q*Pr/(magu*magu*magu*kappa*kappa);
399 else
400 Ri_b = g_dot_n*hi/ti*(ti-ts)/(magu*magu);
401
402 T L_ob = 0.0; // neutral default
403
404 const T L_sign = (Ri_b > 0) ? 1.0 : -1.0;
405
406 // Stability branching localy
407 if (fabs(Ri_b) <= Ri_threshold) {
408 // NEUTRAL CASe
409 utau = kappa * magu / slaw_m_neutral<T>(hi, z0);
410 if constexpr (BC_TYPE == 1) q = kappa/Pr * utau * (ts - ti) / slaw_h_neutral<T>(hi, z0h);
411 }
412 else {
413 // STABLE or CONVECTIVE (NR)
414 // Initial guess based on stability
415 if (Ri_b > 0)
417 else
419
420 T L_old;
421 for (int it = 0; it < max_iter; ++it) {
422 L_old = L_ob;
423 T f_val, dfdl;
424 T fd_h = NR_step * L_ob;
425 T L_upper = L_ob + fd_h;
426 T L_lower = L_ob - fd_h;
427
428 // Use the appropriate simlarity law based on stability and b.c. type
429 if (Ri_b > 0) { // Stable
430 if constexpr (BC_TYPE == 0) {
433 } else {
436 }
437 } else { // Convective
438 if constexpr (BC_TYPE == 0) {
441 } else {
444 }
445 }
446
447 if (fabs(dfdl) < (T)1e-12) break;
448 L_ob -= f_val / dfdl;
449 if (L_ob * L_sign <= 0) L_ob = 0.5 * L_old;
450 L_ob = L_sign * fmax(fmin(fabs(L_ob), (T)1e8), (T)1e-8);
451 if (fabs((L_ob - L_old) / L_ob) < tol) break;
452 }
453
454 // Final local variables update
455 if (Ri_b > 0) {
456 utau = kappa * magu / slaw_m_stable<T>(hi, L_ob, z0);
457 if constexpr (BC_TYPE == 1) q = kappa/Pr * utau * (ts - ti) / slaw_h_stable<T>(hi, L_ob, z0h);
458 } else {
459 utau = kappa * magu / slaw_m_convective<T>(hi, L_ob, z0);
460 if constexpr (BC_TYPE == 1) q = kappa/Pr * utau * (ts - ti) / slaw_h_convective<T>(hi, L_ob, z0h);
461 }
462 }
463 tau_x_d[i] = -rho*utau*utau*ui/magu;
464 tau_y_d[i] = -rho*utau*utau*vi/magu;
465 tau_z_d[i] = -rho*utau*utau*wi/magu;
466
467 const int index_ts = (ind_e_d[i] - 1) * lx * lx * lx +
468 (ind_t_d[i] - 1 - h_z_idx[i]) * lx * lx +
469 (ind_s_d[i] - 1 - h_y_idx[i]) * lx +
470 (ind_r_d[i] - 1 - h_x_idx[i]);
471 Ri_b_diagn[i] = Ri_b;
472 L_ob_diagn[i] = L_ob;
473 utau_diagn[i] = utau;
474 magu_diagn[i] = magu;
475 ti_diagn[i] = ti;
476 ts_diagn[i] = temp_d[index_ts];
477 q_diagn[i] = q;
478 }
479}
480
481#endif // MOST_KERNEL_H
const int i
const int e
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__device__ T slaw_m_neutral(T z, T z0)
__device__ T slaw_h_neutral(T z, T z0h)
__device__ T slaw_m_convective(T z, T L_ob, T z0)
__device__ T slaw_h_stable(T z, T L_ob, T z0h)
Definition most_kernel.h:87
__global__ void most_compute(const T *__restrict__ u_d, const T *__restrict__ v_d, const T *__restrict__ w_d, const T *__restrict__ temp_d, const T *__restrict__ h_d, const T *__restrict__ n_x_d, const T *__restrict__ n_y_d, const T *__restrict__ n_z_d, const int *__restrict__ ind_r_d, const int *__restrict__ ind_s_d, const int *__restrict__ ind_t_d, const int *__restrict__ ind_e_d, T *__restrict__ tau_x_d, T *__restrict__ tau_y_d, T *__restrict__ tau_z_d, int n_nodes, int lx, T kappa, const T *__restrict__ mu_w_d, const T *__restrict__ rho_w_d, T g1, T g2, T g3, T Pr, T z0, T z0h_in, T bc_value, T *__restrict__ Ri_b_diagn, T *__restrict__ L_ob_diagn, T *__restrict__ utau_diagn, T *__restrict__ magu_diagn, T *__restrict__ ti_diagn, T *__restrict__ ts_diagn, T *__restrict__ q_diagn, const int *__restrict__ h_x_idx, const int *__restrict__ h_y_idx, const int *__restrict__ h_z_idx)
__device__ T corr_h_convective(T z, T L_ob)
__device__ T corr_m_convective(T z, T L_ob)
__device__ T f_dirichlet_stable(T Ri_b, T z, T z0, T z0h, T L_ob, T Pr)
__device__ T corr_h_stable(T z, T L_ob)
Definition most_kernel.h:64
__device__ T slaw_m_stable(T z, T L_ob, T z0)
Definition most_kernel.h:79
__device__ T dfdl_neumann_convective(T l_upper, T l_lower, T z, T z0, T z0h, T fd_h, T Pr)
__device__ T dfdl_dirichlet_convective(T l_upper, T l_lower, T z, T z0, T z0h, T fd_h, T Pr)
__device__ T slaw_h_convective(T z, T L_ob, T z0h)
__device__ T f_neumann_stable(T Ri_b, T z, T z0, T z0h, T L_ob, T Pr)
Definition most_kernel.h:95
__device__ T f_neumann_convective(T Ri_b, T z, T z0, T z0h, T L_ob, T Pr)
__device__ T corr_m_stable(T z, T L_ob)
Definition most_kernel.h:51
__device__ T dfdl_neumann_stable(T l_upper, T l_lower, T z, T z0, T z0h, T fd_h, T Pr)
__device__ T f_dirichlet_convective(T Ri_b, T z, T z0, T z0h, T L_ob, T Pr)
__device__ T dfdl_dirichlet_stable(T l_upper, T l_lower, T z, T z0, T z0h, T fd_h, T Pr)