Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
local_interpolation.hip
Go to the documentation of this file.
1/*
2 Copyright (c) 2022-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35
36#include <stdio.h>
37#include <hip/hip_runtime.h>
39#include <device/hip/check.h>
40
44template< typename T, typename xT, const int LX, const int CHUNKS>
46 const T * pt_x,
47 const T * pt_y,
48 const T * pt_z,
49 const T * x_hat,
50 const T * y_hat,
51 const T * z_hat,
55 const int * el_ids,
56 const int n_pt,
57 T tol,
59
60 const int pt = blockIdx.x;
61 if (conv_pts[pt] < 0.5) return;
62 const int e = el_ids[pt];
63 const int elx3 = e*LX*LX*LX;
64 const int str = blockDim.y;
65 const int idx = threadIdx.y;
66
67 const xT one = 1.0;
68 const xT two = 2.0;
69 __shared__ xT dxdr, dydr, dzdr;
70 __shared__ xT dxds, dyds, dzds;
78
79
86
87 r_leg[0] = 1.0;
88 s_leg[0] = 1.0;
89 t_leg[0] = 1.0;
90 r_leg[1] = rst[3*pt];
91 s_leg[1] = rst[1+3*pt];
92 t_leg[1] = rst[2+3*pt];
93 dr_leg[0] = 0.0;
94 ds_leg[0] = 0.0;
95 dt_leg[0] = 0.0;
96 ds_leg[1] = 1.0;
97 dr_leg[1] = 1.0;
98 dt_leg[1] = 1.0;
99
100 for (int ii = 1; ii<LX-1; ii += 1) {
101 xT ir = ii;
102 r_leg[ii+1] = ((two*ir+one) * rst[3*pt] * r_leg[ii] - ir * r_leg[ii-1] ) / (ir+one);
103 s_leg[ii+1] = ((two*ir+one) * rst[3*pt+1] * s_leg[ii] - ir * s_leg[ii-1] ) / (ir+one);
104 t_leg[ii+1] = ((two*ir+one) * rst[3*pt+2] * t_leg[ii] - ir * t_leg[ii-1] ) / (ir+one);
105 dr_leg[ii+1] = (ir+one) * r_leg[ii] + rst[3*pt]*dr_leg[ii];
106 ds_leg[ii+1] = (ir+one) * s_leg[ii] + rst[3*pt+1]*ds_leg[ii];
107 dt_leg[ii+1] = (ir+one) * t_leg[ii] + rst[3*pt+2]*dt_leg[ii];
108 }
110 for (int ii = idx; ii< LX*LX; ii += str) {
111 xT xtmp = 0.0;
112 xT ytmp = 0.0;
113 xT ztmp = 0.0;
114 int j = ii;
115 int i = ii - j;
116 for( int l = 0; l < LX; l++){
117 xtmp += r_leg[i+l]*x_hat[l+LX*j+elx3];
118 ytmp += r_leg[i+l]*y_hat[l+LX*j+elx3];
119 ztmp += r_leg[i+l]*z_hat[l+LX*j+elx3];
120 }
121 xwork[ii] = xtmp;
122 ywork[ii] = ytmp;
123 zwork[ii] = ztmp;
124 }
125
127
128 for (int ijk = idx; ijk< LX; ijk += str) {
129 const int jk = ijk;
130 const int i = ijk - jk;
131 const int k = jk;
132 const int j = jk - k;
133 xT xtmp = 0.0;
134 xT ytmp = 0.0;
135 xT ztmp = 0.0;
136 const int ik2 = i + k*LX;
137 for( int l = 0; l < LX; l++){
138 xtmp += s_leg[l+j*LX]*xwork[l+ik2];
139 ytmp += s_leg[l+j*LX]*ywork[l+ik2];
140 ztmp += s_leg[l+j*LX]*zwork[l+ik2];
141 }
142 xwork2[ijk] = xtmp;
143 ywork2[ijk] = ytmp;
144 zwork2[ijk] = ztmp;
145 }
146
147 if(idx==0){
148 const int ijk = idx;
149 const int jk = ijk;
150 const int i = ijk - jk;
151 const int k = jk ;
152 const int j = jk - k;
153 const int ij2 = i + j;
154 xT xtmp = 0.0;
155 xT ytmp = 0.0;
156 xT ztmp = 0.0;
157 for( int l = 0; l < LX; l++){
158 xtmp += t_leg[l+k*LX]*xwork2[ij2 + l];
159 ytmp += t_leg[l+k*LX]*ywork2[ij2 + l];
160 ztmp += t_leg[l+k*LX]*zwork2[ij2 + l];
161 }
162 newx = xtmp;
163 newy = ytmp;
164 newz = ztmp;
165 //printf("gpu leg and xhat pt rst1 %lf, %lf, %lf, %d, %lf\n",r_leg[5], dr_leg[5],x_hat[elx3],pt, rst[3*pt]);
166 }
168 for (int ii = idx; ii< LX*LX; ii += str) {
169 xT xtmp = 0.0;
170 xT ytmp = 0.0;
171 xT ztmp = 0.0;
172 int j = ii;
173 int i = ii - j;
174 for( int l = 0; l < LX; l++){
175 xtmp += dr_leg[i+l]*x_hat[l+LX*j+elx3];
176 ytmp += dr_leg[i+l]*y_hat[l+LX*j+elx3];
177 ztmp += dr_leg[i+l]*z_hat[l+LX*j+elx3];
178 }
179 xwork[ii] = xtmp;
180 ywork[ii] = ytmp;
181 zwork[ii] = ztmp;
182 }
183
185
186 for (int ijk = idx; ijk< LX; ijk += str) {
187 const int jk = ijk;
188 const int i = ijk - jk;
189 const int k = jk;
190 const int j = jk - k;
191 xT xtmp = 0.0;
192 xT ytmp = 0.0;
193 xT ztmp = 0.0;
194 const int ik2 = i + k*LX;
195 for( int l = 0; l < LX; l++){
196 xtmp += s_leg[l+j*LX]*xwork[l+ik2];
197 ytmp += s_leg[l+j*LX]*ywork[l+ik2];
198 ztmp += s_leg[l+j*LX]*zwork[l+ik2];
199 }
200 xwork2[ijk] = xtmp;
201 ywork2[ijk] = ytmp;
202 zwork2[ijk] = ztmp;
203 }
204
205
206 if (idx == 0) {
207 const int ijk = idx;
208 const int jk = ijk;
209 const int i = ijk - jk;
210 const int k = jk ;
211 const int j = jk - k;
212 xT xtmp = 0.0;
213 xT ytmp = 0.0;
214 xT ztmp = 0.0;
215 const int ij2 = i + j;
216 for( int l = 0; l < LX; l++){
217 xtmp += t_leg[l+k*LX]*xwork2[ij2 + l];
218 ytmp += t_leg[l+k*LX]*ywork2[ij2 + l];
219 ztmp += t_leg[l+k*LX]*zwork2[ij2 + l];
220 }
221 dxdr = xtmp;
222 dydr = ytmp;
223 dzdr = ztmp;
224 }
225
227 for (int ii = idx; ii< LX*LX; ii += str) {
228 xT xtmp = 0.0;
229 xT ytmp = 0.0;
230 xT ztmp = 0.0;
231 int j = ii;
232 int i = ii - j;
233 for( int l = 0; l < LX; l++){
234 xtmp += r_leg[i+l]*x_hat[l+LX*j+elx3];
235 ytmp += r_leg[i+l]*y_hat[l+LX*j+elx3];
236 ztmp += r_leg[i+l]*z_hat[l+LX*j+elx3];
237 }
238 xwork[ii] = xtmp;
239 ywork[ii] = ytmp;
240 zwork[ii] = ztmp;
241 }
242
244
245 for (int ijk = idx; ijk< LX; ijk += str) {
246 const int jk = ijk;
247 const int i = ijk - jk;
248 const int k = jk;
249 const int j = jk - k;
250 xT xtmp = 0.0;
251 xT ytmp = 0.0;
252 xT ztmp = 0.0;
253 const int ik2 = i + k*LX;
254 for( int l = 0; l < LX; l++){
255 xtmp += ds_leg[l+j*LX]*xwork[l+ik2];
256 ytmp += ds_leg[l+j*LX]*ywork[l+ik2];
257 ztmp += ds_leg[l+j*LX]*zwork[l+ik2];
258 }
259 xwork2[ijk] = xtmp;
260 ywork2[ijk] = ytmp;
261 zwork2[ijk] = ztmp;
262 }
263
264
265 if (idx == 0) {
266 const int ijk = idx;
267 const int jk = ijk;
268 const int i = ijk - jk;
269 const int k = jk ;
270 const int j = jk - k;
271 xT xtmp = 0.0;
272 xT ytmp = 0.0;
273 xT ztmp = 0.0;
274 const int ij2 = i + j;
275 for( int l = 0; l < LX; l++){
276 xtmp += t_leg[l+k*LX]*xwork2[ij2 + l];
277 ytmp += t_leg[l+k*LX]*ywork2[ij2 + l];
278 ztmp += t_leg[l+k*LX]*zwork2[ij2 + l];
279 }
280 dxds = xtmp;
281 dyds = ytmp;
282 dzds = ztmp;
283 }
284
286 for (int ii = idx; ii< LX*LX; ii += str) {
287 xT xtmp = 0.0;
288 xT ytmp = 0.0;
289 xT ztmp = 0.0;
290 int j = ii;
291 int i = ii - j;
292 for( int l = 0; l < LX; l++){
293 xtmp += r_leg[i+l]*x_hat[l+LX*j+elx3];
294 ytmp += r_leg[i+l]*y_hat[l+LX*j+elx3];
295 ztmp += r_leg[i+l]*z_hat[l+LX*j+elx3];
296 }
297 xwork[ii] = xtmp;
298 ywork[ii] = ytmp;
299 zwork[ii] = ztmp;
300 }
301
303
304 for (int ijk = idx; ijk< LX; ijk += str) {
305 const int jk = ijk;
306 const int i = ijk - jk;
307 const int k = jk;
308 const int j = jk - k;
309 xT xtmp = 0.0;
310 xT ytmp = 0.0;
311 xT ztmp = 0.0;
312 const int ik2 = i + k*LX;
313 for( int l = 0; l < LX; l++){
314 xtmp += s_leg[l+j*LX]*xwork[l+ik2];
315 ytmp += s_leg[l+j*LX]*ywork[l+ik2];
316 ztmp += s_leg[l+j*LX]*zwork[l+ik2];
317 }
318 xwork2[ijk] = xtmp;
319 ywork2[ijk] = ytmp;
320 zwork2[ijk] = ztmp;
321 }
322
324
325 if( idx == 0){
326 xT xtmp = 0.0;
327 xT ytmp = 0.0;
328 xT ztmp = 0.0;
329 for( int l = 0; l < LX; l++){
330 xtmp += dt_leg[l]*xwork2[l];
331 ytmp += dt_leg[l]*ywork2[l];
332 ztmp += dt_leg[l]*zwork2[l];
333 }
334 xT jacdet;
336 xT drdx, drdy, drdz;
337 xT dsdx, dsdy, dsdz;
338 xT dtdx, dtdy, dtdz;
339 xT dxdt, dydt, dzdt;
340 xT rstd[3];
341 xT rstdiff;
342 xT tol2;
344
345 dxdt = xtmp;
346 dydt = ytmp;
347 dzdt = ztmp;
348 //printf("rlegdrleg %lf %lf %f \n",r_leg[5], dr_leg[5],x_hat[elx3]);
349
350 jacdet = (dxdr * dyds * dzdt)
351 + (dxdt * dydr * dzds)
352 + (dxds * dydt * dzdr)
353 - (dxdr * dydt * dzds)
354 - (dxds * dydr * dzdt)
355 - (dxdt * dyds * dzdr);
356
357 jacdetinv = one / jacdet;
358 //check legendre polynoimial, rst, and xhat
359
360 drdx =(dyds*dzdt - dydt*dzds);
361 drdy =(dxdt*dzds - dxds*dzdt);
362 drdz =(dxds*dydt - dxdt*dyds);
363 dsdx =(dydt*dzdr - dydr*dzdt);
364 dsdy =(dxdr*dzdt - dxdt*dzdr);
365 dsdz =(dxdt*dydr - dxdr*dydt);
366 dtdx =(dydr*dzds - dyds*dzdr);
367 dtdy =(dxds*dzdr - dxdr*dzds);
368 dtdz =(dxdr*dyds - dxds*dydr);
369 //printf("newx gpu %lf \n",newx);
370 resx[pt] = pt_x[pt]-newx;
371 resy[pt] = pt_y[pt]-newy;
372 resz[pt] = pt_z[pt]-newz;
373
377
378 rst[3*pt] += rstd[0];
379 rst[3*pt + 1] += rstd[1];
380 rst[3*pt + 2] += rstd[2];
381 rstdiff = rstd[0]*rstd[0]+rstd[1]*rstd[1]+rstd[2]*rstd[2];
382 conv_pts[pt] = 1.0;
383 tol2 = tol*tol;
384 if (rstdiff <= tol2) conv_pts[pt] = 0.0;
385 if (rstdiff > 12.0) conv_pts[pt] = 0.0;
386 //leg_outside = LX*LX*(LX-one)*(LX-one)/4.0;
387 //if(r_leg[LX-1]*r_leg[LX-1] > leg_outside) conv_pts[pt] = true;
388 //if(s_leg[LX-1]*s_leg[LX-1] > leg_outside) conv_pts[pt] = true;
389 //if(t_leg[LX-1]*t_leg[LX-1] > leg_outside) conv_pts[pt] = true;
390
391 }
392}
393
394
395extern "C" {
396
400 void hip_find_rst_legendre(void *rst,
401 void *pt_x, void* pt_y, void* pt_z,
402 void *x_hat, void *y_hat, void *z_hat,
403 void *resx, void *resy, void *resz,
404 int *lx, void *el_ids, int *n_pt, real *tol,
405 void *conv_pts) {
406
407 const dim3 nthrds(1, 128, 1);
408 const dim3 nblcks((*n_pt), 1, 1);
409 const hipStream_t stream = (hipStream_t) glb_cmd_queue;
410
411#define RST_CASE(LX) \
412 case LX: \
413 hipLaunchKernelGGL( \
414 HIP_KERNEL_NAME(find_rst_legendre_kernel<real,real_xp, LX, 128>), \
415 nblcks, nthrds, 0, stream, \
416 (real *) rst,(real *) pt_x, (real *) pt_y, (real *) pt_z, \
417 (real *) x_hat, (real *) y_hat, (real *) z_hat, \
418 (real *) resx, (real *) resy, (real *) resz, (int *) el_ids, \
419 *n_pt, *tol, (real *) conv_pts); \
420 HIP_CHECK(hipGetLastError()); \
421 break
422
423 switch(*lx) {
424 RST_CASE(2);
425 RST_CASE(3);
426 RST_CASE(4);
427 RST_CASE(5);
428 RST_CASE(6);
429 RST_CASE(7);
430 RST_CASE(8);
431 RST_CASE(9);
432 RST_CASE(10);
433 RST_CASE(11);
434 RST_CASE(12);
435 RST_CASE(13);
436 RST_CASE(14);
437 RST_CASE(15);
438 RST_CASE(16);
439 default:
440 {
441 fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
442 exit(1);
443 }
444 }
445 }
446}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
const int i
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
const int j
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__syncthreads()
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
void hip_find_rst_legendre(void *rst, void *pt_x, void *pt_y, void *pt_z, void *x_hat, void *y_hat, void *z_hat, void *resx, void *resy, void *resz, int *lx, void *el_ids, int *n_pt, real *tol, void *conv_pts)
#define RST_CASE(LX)
__global__ void find_rst_legendre_kernel(T *__restrict__ rst, const T *pt_x, const T *pt_y, const T *pt_z, const T *x_hat, const T *y_hat, const T *z_hat, T *__restrict__ resx, T *__restrict__ resy, T *__restrict__ resz, const int *el_ids, const int n_pt, T tol, T *__restrict__ conv_pts)