Neko 0.9.99
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
cdtp_kernel.h
Go to the documentation of this file.
1#ifndef __MATH_CDTP_KERNEL_H__
2#define __MATH_CDTP_KERNEL_H__
3/*
4 Copyright (c) 2021-2023, The Neko Authors
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following
16 disclaimer in the documentation and/or other materials provided
17 with the distribution.
18
19 * Neither the name of the authors nor the names of its
20 contributors may be used to endorse or promote products derived
21 from this software without specific prior written permission.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35*/
36
41template< typename T, const int LX, const int CHUNKS >
43 const T * __restrict__ x,
44 const T * __restrict__ dr,
45 const T * __restrict__ ds,
46 const T * __restrict__ dt,
47 const T * __restrict__ dxt,
48 const T * __restrict__ dyt,
49 const T * __restrict__ dzt,
50 const T * __restrict__ w3) {
51
55
56 __shared__ T shtar[LX * LX * LX];
57 __shared__ T shtas[LX * LX * LX];
58 __shared__ T shtat[LX * LX * LX];
59
60 const int e = blockIdx.x;
61 const int iii = threadIdx.x;
62 const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
63
64 if (iii < (LX * LX)) {
65 shdxt[iii] = dxt[iii];
66 shdyt[iii] = dyt[iii];
67 shdzt[iii] = dzt[iii];
68 }
69
70 int l = iii;
71 while(l < (LX * LX * LX)) {
72 T wx = x[l + e * LX * LX * LX] * w3[l];
73
74 shtar[l] = wx*dr[l + e * LX * LX * LX];
75 shtas[l] = wx*ds[l + e * LX * LX * LX];
76 shtat[l] = wx*dt[l + e * LX * LX * LX];
77
78 l = l + CHUNKS;
79 }
80
82 for (int n = 0; n < nchunks; n++) {
83 const int ijk = iii + n * CHUNKS;
84 const int jk = ijk / LX;
85 const int i = ijk - jk * LX;
86 const int k = jk / LX;
87 const int j = jk - k * LX;
88 if ( i < LX && j < LX && k < LX && ijk < LX*LX*LX) {
89 T rtmp = 0.0;
90 T stmp = 0.0;
91 T ttmp = 0.0;
92 for (int l = 0; l < LX; l++) {
93 rtmp += shdxt[i + l * LX] * shtar[l+j*LX+k*LX*LX];
94 stmp += shdyt[j + l * LX] * shtas[i+l*LX + k*LX*LX];
95 ttmp += shdzt[k + l * LX] * shtat[i + j*LX + l*LX*LX];
96 }
97 dtx[ijk + e * LX * LX * LX] = ( rtmp + stmp + ttmp );
98
99 }
100 }
101}
102
103template< typename T, const int LX >
114
118
121
125
126 const int e = blockIdx.x;
127 const int j = threadIdx.y;
128 const int i = threadIdx.x;
129 const int ij = i + j * LX;
130 const int ele = e*LX*LX*LX;
131
133 shdyt[ij] = dyt[ij];
134 shdzt[ij] = dzt[ij];
135
136
137#pragma unroll LX
138 for (int k = 0; k < LX; ++k) {
139 T wx = x[ij + k*LX*LX + ele] * w3[ij + k*LX*LX];
140
141 rtar[k] = wx *dr[ij + k*LX*LX + ele];
142 rtas[k] = wx *ds[ij + k*LX*LX + ele];
143 rtat[k] = wx *dt[ij + k*LX*LX + ele];
144 }
145
147
148#pragma unroll
149 for (int k = 0; k < LX; ++k) {
150 const int ijk = ij + k*LX*LX;
151 T ttmp = 0.0;
152 shtar[ij] = rtar[k];
153 shtas[ij] = rtas[k];
154 for (int l = 0; l < LX; l++) {
155 ttmp += shdzt[k+l*LX] * rtat[l];
156 }
158
159 T rtmp = 0.0;
160 T stmp = 0.0;
161#pragma unroll
162 for (int l = 0; l < LX; l++) {
163 rtmp += shdxt[i+l*LX] * shtar[l+j*LX];
164 stmp += shdyt[j+l*LX] * shtas[i+l*LX];
165 }
166
167 dtx[ijk + ele] = ( rtmp + stmp + ttmp );
168
170 }
171}
172
173
174#endif // __MATH_CDTP_KERNEL_H__
__shared__ T shdyt[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ dr
T rtas[LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__shared__ T shtas[LX *LX]
const int i
T rtat[LX]
const int ij
const int e
T rtar[LX]
__global__ void __launch_bounds__(LX *LX, 3) cdtp_kernel_kstep(T *__restrict__ dtx
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__shared__ T shdzt[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
const int ele
const int j
__shared__ T shtar[LX *LX]
__global__ void cdtp_kernel_1d(T *__restrict__ dtx, const T *__restrict__ x, const T *__restrict__ dr, const T *__restrict__ ds, const T *__restrict__ dt, const T *__restrict__ dxt, const T *__restrict__ dyt, const T *__restrict__ dzt, const T *__restrict__ w3)
Definition cdtp_kernel.h:42
__syncthreads()
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
shdxt[ij]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)