Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
local_interpolation.cu
Go to the documentation of this file.
1/*
2 Copyright (c) 2022-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#include <stdio.h>
37#include <device/cuda/check.h>
38
42template< typename T, typename xT, const int LX, const int CHUNKS>
44 const T * pt_x,
45 const T * pt_y,
46 const T * pt_z,
47 const T * x_hat,
48 const T * y_hat,
49 const T * z_hat,
53 const int * el_ids,
54 const int n_pt,
55 T tol,
57
58 const int pt = blockIdx.x;
59 if (conv_pts[pt] < 0.5) return;
60 const int e = el_ids[pt];
61 const int elx3 = e*LX*LX*LX;
62 const int str = blockDim.y;
63 const int idx = threadIdx.y;
64
65 const xT one = 1.0;
66 const xT two = 2.0;
67 __shared__ xT dxdr, dydr, dzdr;
68 __shared__ xT dxds, dyds, dzds;
76
77
84
85 r_leg[0] = 1.0;
86 s_leg[0] = 1.0;
87 t_leg[0] = 1.0;
88 r_leg[1] = rst[3*pt];
89 s_leg[1] = rst[1+3*pt];
90 t_leg[1] = rst[2+3*pt];
91 dr_leg[0] = 0.0;
92 ds_leg[0] = 0.0;
93 dt_leg[0] = 0.0;
94 ds_leg[1] = 1.0;
95 dr_leg[1] = 1.0;
96 dt_leg[1] = 1.0;
97
98 for (int ii = 1; ii<LX-1; ii += 1) {
99 xT ir = ii;
100 r_leg[ii+1] = ((two*ir+one) * rst[3*pt] * r_leg[ii] - ir * r_leg[ii-1] ) / (ir+one);
101 s_leg[ii+1] = ((two*ir+one) * rst[3*pt+1] * s_leg[ii] - ir * s_leg[ii-1] ) / (ir+one);
102 t_leg[ii+1] = ((two*ir+one) * rst[3*pt+2] * t_leg[ii] - ir * t_leg[ii-1] ) / (ir+one);
103 dr_leg[ii+1] = (ir+one) * r_leg[ii] + rst[3*pt]*dr_leg[ii];
104 ds_leg[ii+1] = (ir+one) * s_leg[ii] + rst[3*pt+1]*ds_leg[ii];
105 dt_leg[ii+1] = (ir+one) * t_leg[ii] + rst[3*pt+2]*dt_leg[ii];
106 }
108 for (int ii = idx; ii< LX*LX; ii += str) {
109 xT xtmp = 0.0;
110 xT ytmp = 0.0;
111 xT ztmp = 0.0;
112 int j = ii;
113 int i = ii - j;
114 for( int l = 0; l < LX; l++){
115 xtmp += r_leg[i+l]*x_hat[l+LX*j+elx3];
116 ytmp += r_leg[i+l]*y_hat[l+LX*j+elx3];
117 ztmp += r_leg[i+l]*z_hat[l+LX*j+elx3];
118 }
119 xwork[ii] = xtmp;
120 ywork[ii] = ytmp;
121 zwork[ii] = ztmp;
122 }
123
125
126 for (int ijk = idx; ijk< LX; ijk += str) {
127 const int jk = ijk;
128 const int i = ijk - jk;
129 const int k = jk;
130 const int j = jk - k;
131 xT xtmp = 0.0;
132 xT ytmp = 0.0;
133 xT ztmp = 0.0;
134 const int ik2 = i + k*LX;
135 for( int l = 0; l < LX; l++){
136 xtmp += s_leg[l+j*LX]*xwork[l+ik2];
137 ytmp += s_leg[l+j*LX]*ywork[l+ik2];
138 ztmp += s_leg[l+j*LX]*zwork[l+ik2];
139 }
140 xwork2[ijk] = xtmp;
141 ywork2[ijk] = ytmp;
142 zwork2[ijk] = ztmp;
143 }
144
145 if(idx==0){
146 const int ijk = idx;
147 const int jk = ijk;
148 const int i = ijk - jk;
149 const int k = jk ;
150 const int j = jk - k;
151 const int ij2 = i + j;
152 xT xtmp = 0.0;
153 xT ytmp = 0.0;
154 xT ztmp = 0.0;
155 for( int l = 0; l < LX; l++){
156 xtmp += t_leg[l+k*LX]*xwork2[ij2 + l];
157 ytmp += t_leg[l+k*LX]*ywork2[ij2 + l];
158 ztmp += t_leg[l+k*LX]*zwork2[ij2 + l];
159 }
160 newx = xtmp;
161 newy = ytmp;
162 newz = ztmp;
163 //printf("gpu leg and xhat pt rst1 %lf, %lf, %lf, %d, %lf\n",r_leg[5], dr_leg[5],x_hat[elx3],pt, rst[3*pt]);
164 }
166 for (int ii = idx; ii< LX*LX; ii += str) {
167 xT xtmp = 0.0;
168 xT ytmp = 0.0;
169 xT ztmp = 0.0;
170 int j = ii;
171 int i = ii - j;
172 for( int l = 0; l < LX; l++){
173 xtmp += dr_leg[i+l]*x_hat[l+LX*j+elx3];
174 ytmp += dr_leg[i+l]*y_hat[l+LX*j+elx3];
175 ztmp += dr_leg[i+l]*z_hat[l+LX*j+elx3];
176 }
177 xwork[ii] = xtmp;
178 ywork[ii] = ytmp;
179 zwork[ii] = ztmp;
180 }
181
183
184 for (int ijk = idx; ijk< LX; ijk += str) {
185 const int jk = ijk;
186 const int i = ijk - jk;
187 const int k = jk;
188 const int j = jk - k;
189 xT xtmp = 0.0;
190 xT ytmp = 0.0;
191 xT ztmp = 0.0;
192 const int ik2 = i + k*LX;
193 for( int l = 0; l < LX; l++){
194 xtmp += s_leg[l+j*LX]*xwork[l+ik2];
195 ytmp += s_leg[l+j*LX]*ywork[l+ik2];
196 ztmp += s_leg[l+j*LX]*zwork[l+ik2];
197 }
198 xwork2[ijk] = xtmp;
199 ywork2[ijk] = ytmp;
200 zwork2[ijk] = ztmp;
201 }
202
203
204 if (idx == 0) {
205 const int ijk = idx;
206 const int jk = ijk;
207 const int i = ijk - jk;
208 const int k = jk ;
209 const int j = jk - k;
210 xT xtmp = 0.0;
211 xT ytmp = 0.0;
212 xT ztmp = 0.0;
213 const int ij2 = i + j;
214 for( int l = 0; l < LX; l++){
215 xtmp += t_leg[l+k*LX]*xwork2[ij2 + l];
216 ytmp += t_leg[l+k*LX]*ywork2[ij2 + l];
217 ztmp += t_leg[l+k*LX]*zwork2[ij2 + l];
218 }
219 dxdr = xtmp;
220 dydr = ytmp;
221 dzdr = ztmp;
222 }
223
225 for (int ii = idx; ii< LX*LX; ii += str) {
226 xT xtmp = 0.0;
227 xT ytmp = 0.0;
228 xT ztmp = 0.0;
229 int j = ii;
230 int i = ii - j;
231 for( int l = 0; l < LX; l++){
232 xtmp += r_leg[i+l]*x_hat[l+LX*j+elx3];
233 ytmp += r_leg[i+l]*y_hat[l+LX*j+elx3];
234 ztmp += r_leg[i+l]*z_hat[l+LX*j+elx3];
235 }
236 xwork[ii] = xtmp;
237 ywork[ii] = ytmp;
238 zwork[ii] = ztmp;
239 }
240
242
243 for (int ijk = idx; ijk< LX; ijk += str) {
244 const int jk = ijk;
245 const int i = ijk - jk;
246 const int k = jk;
247 const int j = jk - k;
248 xT xtmp = 0.0;
249 xT ytmp = 0.0;
250 xT ztmp = 0.0;
251 const int ik2 = i + k*LX;
252 for( int l = 0; l < LX; l++){
253 xtmp += ds_leg[l+j*LX]*xwork[l+ik2];
254 ytmp += ds_leg[l+j*LX]*ywork[l+ik2];
255 ztmp += ds_leg[l+j*LX]*zwork[l+ik2];
256 }
257 xwork2[ijk] = xtmp;
258 ywork2[ijk] = ytmp;
259 zwork2[ijk] = ztmp;
260 }
261
262
263 if (idx == 0) {
264 const int ijk = idx;
265 const int jk = ijk;
266 const int i = ijk - jk;
267 const int k = jk ;
268 const int j = jk - k;
269 xT xtmp = 0.0;
270 xT ytmp = 0.0;
271 xT ztmp = 0.0;
272 const int ij2 = i + j;
273 for( int l = 0; l < LX; l++){
274 xtmp += t_leg[l+k*LX]*xwork2[ij2 + l];
275 ytmp += t_leg[l+k*LX]*ywork2[ij2 + l];
276 ztmp += t_leg[l+k*LX]*zwork2[ij2 + l];
277 }
278 dxds = xtmp;
279 dyds = ytmp;
280 dzds = ztmp;
281 }
282
284 for (int ii = idx; ii< LX*LX; ii += str) {
285 xT xtmp = 0.0;
286 xT ytmp = 0.0;
287 xT ztmp = 0.0;
288 int j = ii;
289 int i = ii - j;
290 for( int l = 0; l < LX; l++){
291 xtmp += r_leg[i+l]*x_hat[l+LX*j+elx3];
292 ytmp += r_leg[i+l]*y_hat[l+LX*j+elx3];
293 ztmp += r_leg[i+l]*z_hat[l+LX*j+elx3];
294 }
295 xwork[ii] = xtmp;
296 ywork[ii] = ytmp;
297 zwork[ii] = ztmp;
298 }
299
301
302 for (int ijk = idx; ijk< LX; ijk += str) {
303 const int jk = ijk;
304 const int i = ijk - jk;
305 const int k = jk;
306 const int j = jk - k;
307 xT xtmp = 0.0;
308 xT ytmp = 0.0;
309 xT ztmp = 0.0;
310 const int ik2 = i + k*LX;
311 for( int l = 0; l < LX; l++){
312 xtmp += s_leg[l+j*LX]*xwork[l+ik2];
313 ytmp += s_leg[l+j*LX]*ywork[l+ik2];
314 ztmp += s_leg[l+j*LX]*zwork[l+ik2];
315 }
316 xwork2[ijk] = xtmp;
317 ywork2[ijk] = ytmp;
318 zwork2[ijk] = ztmp;
319 }
320
322
323 if( idx == 0){
324 xT xtmp = 0.0;
325 xT ytmp = 0.0;
326 xT ztmp = 0.0;
327 for( int l = 0; l < LX; l++){
328 xtmp += dt_leg[l]*xwork2[l];
329 ytmp += dt_leg[l]*ywork2[l];
330 ztmp += dt_leg[l]*zwork2[l];
331 }
332 xT jacdet;
334 xT drdx, drdy, drdz;
335 xT dsdx, dsdy, dsdz;
336 xT dtdx, dtdy, dtdz;
337 xT dxdt, dydt, dzdt;
338 xT rstd[3];
339 xT rstdiff;
340 xT tol2;
342
343 dxdt = xtmp;
344 dydt = ytmp;
345 dzdt = ztmp;
346 //printf("rlegdrleg %lf %lf %f \n",r_leg[5], dr_leg[5],x_hat[elx3]);
347
348 jacdet = (dxdr * dyds * dzdt)
349 + (dxdt * dydr * dzds)
350 + (dxds * dydt * dzdr)
351 - (dxdr * dydt * dzds)
352 - (dxds * dydr * dzdt)
353 - (dxdt * dyds * dzdr);
354
355 jacdetinv = one / jacdet;
356 //check legendre polynoimial, rst, and xhat
357
358 drdx =(dyds*dzdt - dydt*dzds);
359 drdy =(dxdt*dzds - dxds*dzdt);
360 drdz =(dxds*dydt - dxdt*dyds);
361 dsdx =(dydt*dzdr - dydr*dzdt);
362 dsdy =(dxdr*dzdt - dxdt*dzdr);
363 dsdz =(dxdt*dydr - dxdr*dydt);
364 dtdx =(dydr*dzds - dyds*dzdr);
365 dtdy =(dxds*dzdr - dxdr*dzds);
366 dtdz =(dxdr*dyds - dxds*dydr);
367 //printf("newx gpu %lf \n",newx);
368 resx[pt] = pt_x[pt]-newx;
369 resy[pt] = pt_y[pt]-newy;
370 resz[pt] = pt_z[pt]-newz;
371
375
376 rst[3*pt] += rstd[0];
377 rst[3*pt + 1] += rstd[1];
378 rst[3*pt + 2] += rstd[2];
379 rstdiff = rstd[0]*rstd[0]+rstd[1]*rstd[1]+rstd[2]*rstd[2];
380 conv_pts[pt] = 1.0;
381 tol2 = tol*tol;
382 if (rstdiff <= tol2) conv_pts[pt] = 0.0;
383 if (rstdiff > 12.0) conv_pts[pt] = 0.0;
384 //leg_outside = LX*LX*(LX-one)*(LX-one)/4.0;
385 //if(r_leg[LX-1]*r_leg[LX-1] > leg_outside) conv_pts[pt] = true;
386 //if(s_leg[LX-1]*s_leg[LX-1] > leg_outside) conv_pts[pt] = true;
387 //if(t_leg[LX-1]*t_leg[LX-1] > leg_outside) conv_pts[pt] = true;
388
389 }
390}
391
392
393extern "C" {
394
399 void *pt_x, void* pt_y, void* pt_z,
400 void *x_hat, void *y_hat, void *z_hat,
401 void *resx, void *resy, void *resz,
402 int *lx, void *el_ids, int *n_pt, real *tol,
403 void *conv_pts) {
404
405 const dim3 nthrds(1, 128, 1);
406 const dim3 nblcks((*n_pt), 1, 1);
407 const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
408
409#define RST_CASE(LX) \
410 case LX: \
411 find_rst_legendre_kernel<real,real_xp, LX, 128> \
412 <<<nblcks, nthrds, 0, stream>>> \
413 ((real *) rst,(real *) pt_x, (real *) pt_y, (real *) pt_z, \
414 (real *) x_hat, (real *) y_hat, (real *) z_hat, \
415 (real *) resx, (real *) resy, (real *) resz, (int *) el_ids, \
416 *n_pt, *tol, (real *) conv_pts); \
417 CUDA_CHECK(cudaGetLastError()); \
418 break
419
420 switch(*lx) {
421 RST_CASE(2);
422 RST_CASE(3);
423 RST_CASE(4);
424 RST_CASE(5);
425 RST_CASE(6);
426 RST_CASE(7);
427 RST_CASE(8);
428 RST_CASE(9);
429 RST_CASE(10);
430 RST_CASE(11);
431 RST_CASE(12);
432 RST_CASE(13);
433 RST_CASE(14);
434 RST_CASE(15);
435 RST_CASE(16);
436 default:
437 {
438 fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
439 exit(1);
440 }
441 }
442 }
443}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
const int i
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
const int j
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__syncthreads()
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
#define RST_CASE(LX)
__global__ void find_rst_legendre_kernel(T *__restrict__ rst, const T *pt_x, const T *pt_y, const T *pt_z, const T *x_hat, const T *y_hat, const T *z_hat, T *__restrict__ resx, T *__restrict__ resy, T *__restrict__ resz, const int *el_ids, const int n_pt, T tol, T *__restrict__ conv_pts)
void cuda_find_rst_legendre(void *rst, void *pt_x, void *pt_y, void *pt_z, void *x_hat, void *y_hat, void *z_hat, void *resx, void *resy, void *resz, int *lx, void *el_ids, int *n_pt, real *tol, void *conv_pts)