Neko  0.8.99
A portable framework for high-order spectral element flow simulations
ax_helm_full_kernel.h
Go to the documentation of this file.
1 #ifndef __MATH_AX_HELM_FULL_KERNEL_H__
2 #define __MATH_AX_HELM_FULL_KERNEL_H__
3 /*
4  Copyright (c) 2024, The Neko Authors
5  All rights reserved.
6 
7  Redistribution and use in source and binary forms, with or without
8  modification, are permitted provided that the following conditions
9  are met:
10 
11  * Redistributions of source code must retain the above copyright
12  notice, this list of conditions and the following disclaimer.
13 
14  * Redistributions in binary form must reproduce the above
15  copyright notice, this list of conditions and the following
16  disclaimer in the documentation and/or other materials provided
17  with the distribution.
18 
19  * Neither the name of the authors nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  POSSIBILITY OF SUCH DAMAGE.
35 */
36 
/**
 * Stress-formulation Helmholtz matrix-vector product (vector version,
 * k-step register blocking).
 *
 * Launch layout: one thread block per spectral element (blockIdx.x = e),
 * blockDim = (LX, LX); each thread owns one (i,j) column of the element
 * and marches over the third index k, keeping the column of u/v/w and
 * the partial results in registers (ru/rv/rw, ruw/rvw/rww).
 *
 * Shared memory: the three LX*LX 1D differentiation matrices
 * (shdx/shdy/shdz) plus per-k-slab scratch for the fields and their
 * r/s-derivative contributions — 12 * LX * LX elements of T total.
 *
 * au/av/aw are overwritten with the result (no accumulation with prior
 * contents). dx/dy/dz are LX*LX differentiation matrices shared by all
 * elements; drd*/dsd*/dtd* are per-point metric terms, h1 the variable
 * coefficient, jacinv the inverse Jacobian and weight3 the 3D quadrature
 * weights of a single element (indexed without the element offset).
 */
template< typename T, const int LX >
__global__ void __launch_bounds__(LX*LX,3)
  ax_helm_stress_kernel_vector_kstep(T * __restrict__ au,
                                     T * __restrict__ av,
                                     T * __restrict__ aw,
                                     const T * __restrict__ u,
                                     const T * __restrict__ v,
                                     const T * __restrict__ w,
                                     const T * __restrict__ dx,
                                     const T * __restrict__ dy,
                                     const T * __restrict__ dz,
                                     const T * __restrict__ h1,
                                     const T * __restrict__ drdx,
                                     const T * __restrict__ drdy,
                                     const T * __restrict__ drdz,
                                     const T * __restrict__ dsdx,
                                     const T * __restrict__ dsdy,
                                     const T * __restrict__ dsdz,
                                     const T * __restrict__ dtdx,
                                     const T * __restrict__ dtdy,
                                     const T * __restrict__ dtdz,
                                     const T * __restrict__ jacinv,
                                     const T * __restrict__ weight3) {

  /* 1D differentiation matrices, staged once per block. */
  __shared__ T shdx[LX * LX];
  __shared__ T shdy[LX * LX];
  __shared__ T shdz[LX * LX];

  /* Current k-slab of each field and its r/s derivative contributions. */
  __shared__ T shu[LX * LX];
  __shared__ T shur[LX * LX];
  __shared__ T shus[LX * LX];

  __shared__ T shv[LX * LX];
  __shared__ T shvr[LX * LX];
  __shared__ T shvs[LX * LX];

  __shared__ T shw[LX * LX];
  __shared__ T shwr[LX * LX];
  __shared__ T shws[LX * LX];

  /* This thread's full (i,j) column of the input fields. */
  T ru[LX];
  T rv[LX];
  T rw[LX];

  /* Accumulated result column. */
  T ruw[LX];
  T rvw[LX];
  T rww[LX];

  /* t-direction contributions of the current slab. */
  T rut;
  T rvt;
  T rwt;

  const int e = blockIdx.x;
  const int j = threadIdx.y;
  const int i = threadIdx.x;
  const int ij = i + j*LX;
  const int ele = e*LX*LX*LX;

  shdx[ij] = dx[ij];
  shdy[ij] = dy[ij];
  shdz[ij] = dz[ij];

#pragma unroll
  for(int k = 0; k < LX; ++k){
    ru[k] = u[ij + k*LX*LX + ele];
    ruw[k] = 0.0;

    rv[k] = v[ij + k*LX*LX + ele];
    rvw[k] = 0.0;

    rw[k] = w[ij + k*LX*LX + ele];
    rww[k] = 0.0;
  }

  /* Barrier before the k-loop: the first iteration reads shdz (written
     above by *other* threads) before reaching the in-loop barrier, so
     without this sync those reads race with the writes.  The padded
     variant of this kernel places the same barrier here. */
  __syncthreads();

#pragma unroll
  for (int k = 0; k < LX; ++k){
    const int ijk = ij + k*LX*LX;
    const T drdx_local = drdx[ijk+ele];
    const T drdy_local = drdy[ijk+ele];
    const T drdz_local = drdz[ijk+ele];
    const T dsdx_local = dsdx[ijk+ele];
    const T dsdy_local = dsdy[ijk+ele];
    const T dsdz_local = dsdz[ijk+ele];
    const T dtdx_local = dtdx[ijk+ele];
    const T dtdy_local = dtdy[ijk+ele];
    const T dtdz_local = dtdz[ijk+ele];
    /* Combined coefficient: h1 * quadrature weight * inverse Jacobian. */
    const T dj = h1[ijk+ele] *
      weight3[ijk] *
      jacinv[ijk+ele];

    /* t-derivatives come straight from the register column. */
    T uttmp = 0.0;
    T vttmp = 0.0;
    T wttmp = 0.0;
    shu[ij] = ru[k];
    shv[ij] = rv[k];
    shw[ij] = rw[k];
    for (int l = 0; l < LX; l++){
      uttmp += shdz[k+l*LX] * ru[l];
      vttmp += shdz[k+l*LX] * rv[l];
      wttmp += shdz[k+l*LX] * rw[l];
    }
    __syncthreads();

    /* r- and s-derivatives of the staged slab. */
    T urtmp = 0.0;
    T ustmp = 0.0;

    T vrtmp = 0.0;
    T vstmp = 0.0;

    T wrtmp = 0.0;
    T wstmp = 0.0;
#pragma unroll
    for (int l = 0; l < LX; l++){
      urtmp += shdx[i+l*LX] * shu[l+j*LX];
      ustmp += shdy[j+l*LX] * shu[i+l*LX];

      vrtmp += shdx[i+l*LX] * shv[l+j*LX];
      vstmp += shdy[j+l*LX] * shv[i+l*LX];

      wrtmp += shdx[i+l*LX] * shw[l+j*LX];
      wstmp += shdy[j+l*LX] * shw[i+l*LX];
    }

    /* Physical-space gradients via the chain rule. */
    T u1 = 0.0;
    T u2 = 0.0;
    T u3 = 0.0;
    T v1 = 0.0;
    T v2 = 0.0;
    T v3 = 0.0;
    T w1 = 0.0;
    T w2 = 0.0;
    T w3 = 0.0;

    u1 = urtmp * drdx_local +
      ustmp * dsdx_local +
      uttmp * dtdx_local;
    u2 = urtmp * drdy_local +
      ustmp * dsdy_local +
      uttmp * dtdy_local;
    u3 = urtmp * drdz_local +
      ustmp * dsdz_local +
      uttmp * dtdz_local;

    v1 = vrtmp * drdx_local +
      vstmp * dsdx_local +
      vttmp * dtdx_local;
    v2 = vrtmp * drdy_local +
      vstmp * dsdy_local +
      vttmp * dtdy_local;
    v3 = vrtmp * drdz_local +
      vstmp * dsdz_local +
      vttmp * dtdz_local;

    w1 = wrtmp * drdx_local +
      wstmp * dsdx_local +
      wttmp * dtdx_local;
    w2 = wrtmp * drdy_local +
      wstmp * dsdy_local +
      wttmp * dtdy_local;
    w3 = wrtmp * drdz_local +
      wstmp * dsdz_local +
      wttmp * dtdz_local;

    /* Scaled symmetric stress tensor s_ab = dj * (grad + grad^T). */
    T s11 = 0.0;
    T s12 = 0.0;
    T s13 = 0.0;
    T s21 = 0.0;
    T s22 = 0.0;
    T s23 = 0.0;
    T s31 = 0.0;
    T s32 = 0.0;
    T s33 = 0.0;

    s11 = dj*(u1 + u1);
    s12 = dj*(u2 + v1);
    s13 = dj*(u3 + w1);
    s21 = dj*(v1 + u2);
    s22 = dj*(v2 + v2);
    s23 = dj*(v3 + w2);
    s31 = dj*(w1 + u3);
    s32 = dj*(w2 + v3);
    s33 = dj*(w3 + w3);

    /* Project the stress back onto the reference directions. */
    shur[ij] = drdx_local * s11 +
      drdy_local * s12 +
      drdz_local * s13;
    shus[ij] = dsdx_local * s11 +
      dsdy_local * s12 +
      dsdz_local * s13;
    rut = dtdx_local * s11 +
      dtdy_local * s12 +
      dtdz_local * s13;

    shvr[ij] = drdx_local * s21 +
      drdy_local * s22 +
      drdz_local * s23;
    shvs[ij] = dsdx_local * s21 +
      dsdy_local * s22 +
      dsdz_local * s23;
    rvt = dtdx_local * s21 +
      dtdy_local * s22 +
      dtdz_local * s23;

    shwr[ij] = drdx_local * s31 +
      drdy_local * s32 +
      drdz_local * s33;
    shws[ij] = dsdx_local * s31 +
      dsdy_local * s32 +
      dsdz_local * s33;
    rwt = dtdx_local * s31 +
      dtdy_local * s32 +
      dtdz_local * s33;

    /* All shur/shus/... writes must land before the transposed reads below. */
    __syncthreads();

    /* Apply D^T in r and s for this slab; scatter the t contribution of
       this slab into every k-level of the register accumulator. */
    T uwijke = 0.0;
    T vwijke = 0.0;
    T wwijke = 0.0;
#pragma unroll
    for (int l = 0; l < LX; l++){
      uwijke += shur[l+j*LX] * shdx[l+i*LX];
      ruw[l] += rut * shdz[k+l*LX];
      uwijke += shus[i+l*LX] * shdy[l + j*LX];

      vwijke += shvr[l+j*LX] * shdx[l+i*LX];
      rvw[l] += rvt * shdz[k+l*LX];
      vwijke += shvs[i+l*LX] * shdy[l + j*LX];

      wwijke += shwr[l+j*LX] * shdx[l+i*LX];
      rww[l] += rwt * shdz[k+l*LX];
      wwijke += shws[i+l*LX] * shdy[l + j*LX];
    }
    ruw[k] += uwijke;
    rvw[k] += vwijke;
    rww[k] += wwijke;
  }
  /* Write back the accumulated result column. */
#pragma unroll
  for (int k = 0; k < LX; ++k){
    au[ij + k*LX*LX + ele] = ruw[k];
    av[ij + k*LX*LX + ele] = rvw[k];
    aw[ij + k*LX*LX + ele] = rww[k];
  }
}
286 
/**
 * Padded variant of ax_helm_stress_kernel_vector_kstep: identical
 * algorithm, but most shared arrays use a row stride of LX+1 instead of
 * LX (classic padding to avoid shared-memory bank conflicts on the
 * transposed accesses; the shur/shvr/shwr arrays keep stride LX).
 *
 * Launch layout: one thread block per element (blockIdx.x = e),
 * blockDim = (LX, LX); each thread owns one (i,j) column and marches
 * over k, keeping its column of u/v/w and the partial results in
 * registers.
 *
 * au/av/aw are overwritten with the result.  dx/dy/dz are the LX*LX
 * differentiation matrices (read with unpadded index ij, stored with
 * padded index ij_p); drd*/dsd*/dtd* are per-point metric terms, h1 the
 * variable coefficient, jacinv the inverse Jacobian and weight3 the 3D
 * quadrature weights of a single element (no element offset).
 */
template< typename T, const int LX >
__global__ void __launch_bounds__(LX*LX,3)
  ax_helm_stress_kernel_vector_kstep_padded(T * __restrict__ au,
                                            T * __restrict__ av,
                                            T * __restrict__ aw,
                                            const T * __restrict__ u,
                                            const T * __restrict__ v,
                                            const T * __restrict__ w,
                                            const T * __restrict__ dx,
                                            const T * __restrict__ dy,
                                            const T * __restrict__ dz,
                                            const T * __restrict__ h1,
                                            const T * __restrict__ drdx,
                                            const T * __restrict__ drdy,
                                            const T * __restrict__ drdz,
                                            const T * __restrict__ dsdx,
                                            const T * __restrict__ dsdy,
                                            const T * __restrict__ dsdz,
                                            const T * __restrict__ dtdx,
                                            const T * __restrict__ dtdy,
                                            const T * __restrict__ dtdz,
                                            const T * __restrict__ jacinv,
                                            const T * __restrict__ weight3) {

  /* 1D differentiation matrices, padded row stride LX+1. */
  __shared__ T shdx[LX * (LX+1)];
  __shared__ T shdy[LX * (LX+1)];
  __shared__ T shdz[LX * (LX+1)];

  /* Current k-slab of each field (padded) and its r/s contributions
     (shur/shvr/shwr deliberately unpadded). */
  __shared__ T shu[LX * (LX+1)];
  __shared__ T shur[LX * LX];
  __shared__ T shus[LX * (LX+1)];

  __shared__ T shv[LX * (LX+1)];
  __shared__ T shvr[LX * LX];
  __shared__ T shvs[LX * (LX+1)];

  __shared__ T shw[LX * (LX+1)];
  __shared__ T shwr[LX * LX];
  __shared__ T shws[LX * (LX+1)];

  /* This thread's full (i,j) column of the input fields. */
  T ru[LX];
  T rv[LX];
  T rw[LX];

  /* Accumulated result column. */
  T ruw[LX];
  T rvw[LX];
  T rww[LX];

  /* t-direction contributions of the current slab. */
  T rut;
  T rvt;
  T rwt;

  const int e = blockIdx.x;
  const int j = threadIdx.y;
  const int i = threadIdx.x;
  const int ij = i + j*LX;        /* unpadded (global-memory) index */
  const int ij_p = i + j*(LX+1);  /* padded (shared-memory) index */
  const int ele = e*LX*LX*LX;

  shdx[ij_p] = dx[ij];
  shdy[ij_p] = dy[ij];
  shdz[ij_p] = dz[ij];

#pragma unroll
  for(int k = 0; k < LX; ++k){
    ru[k] = u[ij + k*LX*LX + ele];
    ruw[k] = 0.0;

    rv[k] = v[ij + k*LX*LX + ele];
    rvw[k] = 0.0;

    rw[k] = w[ij + k*LX*LX + ele];
    rww[k] = 0.0;
  }

  /* shdz is read by other threads in the first k-iteration below, so the
     staging writes above must be visible block-wide before the loop. */
  __syncthreads();
#pragma unroll
  for (int k = 0; k < LX; ++k){
    const int ijk = ij + k*LX*LX;
    const T drdx_local = drdx[ijk+ele];
    const T drdy_local = drdy[ijk+ele];
    const T drdz_local = drdz[ijk+ele];
    const T dsdx_local = dsdx[ijk+ele];
    const T dsdy_local = dsdy[ijk+ele];
    const T dsdz_local = dsdz[ijk+ele];
    const T dtdx_local = dtdx[ijk+ele];
    const T dtdy_local = dtdy[ijk+ele];
    const T dtdz_local = dtdz[ijk+ele];
    /* Combined coefficient: h1 * quadrature weight * inverse Jacobian. */
    const T dj = h1[ijk+ele] *
      weight3[ijk] *
      jacinv[ijk+ele];

    /* t-derivatives come straight from the register column. */
    T uttmp = 0.0;
    T vttmp = 0.0;
    T wttmp = 0.0;
    shu[ij_p] = ru[k];
    shv[ij_p] = rv[k];
    shw[ij_p] = rw[k];
    for (int l = 0; l < LX; l++){
      uttmp += shdz[k+l*(LX+1)] * ru[l];
      vttmp += shdz[k+l*(LX+1)] * rv[l];
      wttmp += shdz[k+l*(LX+1)] * rw[l];
    }
    __syncthreads();

    /* r- and s-derivatives of the staged slab. */
    T urtmp = 0.0;
    T ustmp = 0.0;

    T vrtmp = 0.0;
    T vstmp = 0.0;

    T wrtmp = 0.0;
    T wstmp = 0.0;
#pragma unroll
    for (int l = 0; l < LX; l++){
      urtmp += shdx[i+l*(LX+1)] * shu[l+j*(LX+1)];
      ustmp += shdy[j+l*(LX+1)] * shu[i+l*(LX+1)];

      vrtmp += shdx[i+l*(LX+1)] * shv[l+j*(LX+1)];
      vstmp += shdy[j+l*(LX+1)] * shv[i+l*(LX+1)];

      wrtmp += shdx[i+l*(LX+1)] * shw[l+j*(LX+1)];
      wstmp += shdy[j+l*(LX+1)] * shw[i+l*(LX+1)];
    }

    /* Physical-space gradients via the chain rule. */
    T u1 = 0.0;
    T u2 = 0.0;
    T u3 = 0.0;
    T v1 = 0.0;
    T v2 = 0.0;
    T v3 = 0.0;
    T w1 = 0.0;
    T w2 = 0.0;
    T w3 = 0.0;

    u1 = urtmp * drdx_local +
      ustmp * dsdx_local +
      uttmp * dtdx_local;
    u2 = urtmp * drdy_local +
      ustmp * dsdy_local +
      uttmp * dtdy_local;
    u3 = urtmp * drdz_local +
      ustmp * dsdz_local +
      uttmp * dtdz_local;

    v1 = vrtmp * drdx_local +
      vstmp * dsdx_local +
      vttmp * dtdx_local;
    v2 = vrtmp * drdy_local +
      vstmp * dsdy_local +
      vttmp * dtdy_local;
    v3 = vrtmp * drdz_local +
      vstmp * dsdz_local +
      vttmp * dtdz_local;

    w1 = wrtmp * drdx_local +
      wstmp * dsdx_local +
      wttmp * dtdx_local;
    w2 = wrtmp * drdy_local +
      wstmp * dsdy_local +
      wttmp * dtdy_local;
    w3 = wrtmp * drdz_local +
      wstmp * dsdz_local +
      wttmp * dtdz_local;

    /* Scaled symmetric stress tensor s_ab = dj * (grad + grad^T). */
    T s11 = 0.0;
    T s12 = 0.0;
    T s13 = 0.0;
    T s21 = 0.0;
    T s22 = 0.0;
    T s23 = 0.0;
    T s31 = 0.0;
    T s32 = 0.0;
    T s33 = 0.0;

    s11 = dj*(u1 + u1);
    s12 = dj*(u2 + v1);
    s13 = dj*(u3 + w1);
    s21 = dj*(v1 + u2);
    s22 = dj*(v2 + v2);
    s23 = dj*(v3 + w2);
    s31 = dj*(w1 + u3);
    s32 = dj*(w2 + v3);
    s33 = dj*(w3 + w3);

    /* Project the stress back onto the reference directions. */
    shur[ij] = drdx_local * s11 +
      drdy_local * s12 +
      drdz_local * s13;
    shus[ij_p] = dsdx_local * s11 +
      dsdy_local * s12 +
      dsdz_local * s13;
    rut = dtdx_local * s11 +
      dtdy_local * s12 +
      dtdz_local * s13;

    shvr[ij] = drdx_local * s21 +
      drdy_local * s22 +
      drdz_local * s23;
    shvs[ij_p] = dsdx_local * s21 +
      dsdy_local * s22 +
      dsdz_local * s23;
    rvt = dtdx_local * s21 +
      dtdy_local * s22 +
      dtdz_local * s23;

    shwr[ij] = drdx_local * s31 +
      drdy_local * s32 +
      drdz_local * s33;
    shws[ij_p] = dsdx_local * s31 +
      dsdy_local * s32 +
      dsdz_local * s33;
    rwt = dtdx_local * s31 +
      dtdy_local * s32 +
      dtdz_local * s33;

    /* All shur/shus/... writes must land before the transposed reads below. */
    __syncthreads();

    /* Apply D^T in r and s for this slab; scatter the t contribution of
       this slab into every k-level of the register accumulator. */
    T uwijke = 0.0;
    T vwijke = 0.0;
    T wwijke = 0.0;
#pragma unroll
    for (int l = 0; l < LX; l++){
      uwijke += shur[l+j*LX] * shdx[l+i*(LX+1)];
      ruw[l] += rut * shdz[k+l*(LX+1)];
      uwijke += shus[i+l*(LX+1)] * shdy[l + j*(LX+1)];

      vwijke += shvr[l+j*LX] * shdx[l+i*(LX+1)];
      rvw[l] += rvt * shdz[k+l*(LX+1)];
      vwijke += shvs[i+l*(LX+1)] * shdy[l + j*(LX+1)];

      wwijke += shwr[l+j*LX] * shdx[l+i*(LX+1)];
      rww[l] += rwt * shdz[k+l*(LX+1)];
      wwijke += shws[i+l*(LX+1)] * shdy[l + j*(LX+1)];
    }
    ruw[k] += uwijke;
    rvw[k] += vwijke;
    rww[k] += wwijke;
  }
  /* Write back the accumulated result column. */
#pragma unroll
  for (int k = 0; k < LX; ++k){
    au[ij + k*LX*LX + ele] = ruw[k];
    av[ij + k*LX*LX + ele] = rvw[k];
    aw[ij + k*LX*LX + ele] = rww[k];
  }
}
533 
/**
 * Second pass of the stress-formulation Helmholtz operator: accumulate
 * the mass-matrix term h2 * B * (u,v,w) into the partial results
 * au/av/aw produced by the first-pass kernels.
 *
 * Plain grid-stride loop over n points (stride = blockDim.x * gridDim.x),
 * so any 1D launch configuration covers the whole array.
 */
template< typename T >
__global__ void ax_helm_stress_kernel_vector_part2(T * __restrict__ au,
                                                   T * __restrict__ av,
                                                   T * __restrict__ aw,
                                                   const T * __restrict__ u,
                                                   const T * __restrict__ v,
                                                   const T * __restrict__ w,
                                                   const T * __restrict__ h2,
                                                   const T * __restrict__ B,
                                                   const int n) {

  const int stride = blockDim.x * gridDim.x;

  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
    /* Shared factor for all three components at this point. */
    const T hb = h2[i] * B[i];
    au[i] += hb * u[i];
    av[i] += hb * v[i];
    aw[i] += hb * w[i];
  }

}
555 #endif // __MATH_AX_HELM_FULL_KERNEL_H__
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
T rv[LX]
__shared__ T shu[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ weight3
const int ij_p
__shared__ T shdy[LX *LX]
__global__ void __launch_bounds__(LX *LX, 3) ax_helm_stress_kernel_vector_kstep(T *__restrict__ au
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__shared__ T shw[LX *LX]
T rvw[LX]
__global__ void T *__restrict__ T *__restrict__ aw
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__shared__ T shdz[LX *LX]
T rww[LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__shared__ T shvs[LX *LX]
__shared__ T shws[LX *LX]
const int i
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
const int ij
shdx[ij]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
const int e
__global__ void T *__restrict__ av
T ruw[LX]
T ru[LX]
__global__ void ax_helm_stress_kernel_vector_part2(T *__restrict__ au, T *__restrict__ av, T *__restrict__ aw, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ h2, const T *__restrict__ B, const int n)
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__shared__ T shwr[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
const int ele
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
T rw[LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__shared__ T shus[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__syncthreads()
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
__shared__ T shv[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__shared__ T shvr[LX *LX]
__shared__ T shur[LX *LX]
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
Definition: cdtp_kernel.h:113