Neko  0.8.99
A portable framework for high-order spectral element flow simulations
coef_kernel.h
Go to the documentation of this file.
1 #ifndef __SEM_COEF_KERNEL_H__
2 #define __SEM_COEF_KERNEL_H__
3 /*
4  Copyright (c) 2022, The Neko Authors
5  All rights reserved.
6 
7  Redistribution and use in source and binary forms, with or without
8  modification, are permitted provided that the following conditions
9  are met:
10 
11  * Redistributions of source code must retain the above copyright
12  notice, this list of conditions and the following disclaimer.
13 
14  * Redistributions in binary form must reproduce the above
15  copyright notice, this list of conditions and the following
16  disclaimer in the documentation and/or other materials provided
17  with the distribution.
18 
19  * Neither the name of the authors nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  POSSIBILITY OF SUCH DAMAGE.
35 */
36 
40 template< typename T, const int LX, const int CHUNKS >
41 __global__ void coef_generate_geo_kernel(T * __restrict__ G11,
42  T * __restrict__ G12,
43  T * __restrict__ G13,
44  T * __restrict__ G22,
45  T * __restrict__ G23,
46  T * __restrict__ G33,
47  const T * __restrict__ drdx,
48  const T * __restrict__ drdy,
49  const T * __restrict__ drdz,
50  const T * __restrict__ dsdx,
51  const T * __restrict__ dsdy,
52  const T * __restrict__ dsdz,
53  const T * __restrict__ dtdx,
54  const T * __restrict__ dtdy,
55  const T * __restrict__ dtdz,
56  const T * __restrict__ jacinv,
57  const T * __restrict__ w3,
58  const int gdim) {
59 
60  int i,j,k;
61 
62  const int e = blockIdx.x;
63  const int iii = threadIdx.x;
64  const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
65 
66  __shared__ T shw3[LX * LX * LX];
67 
68  j = iii;
69  while( j < (LX * LX * LX)) {
70  const int i = j + e * LX * LX * LX;
71  G11[i] = (drdx[i]*drdx[i] + drdy[i]*drdy[i] + drdz[i]*drdz[i]) * jacinv[i];
72  G22[i] = (dsdx[i]*dsdx[i] + dsdy[i]*dsdy[i] + dsdz[i]*dsdz[i]) * jacinv[i];
73  G33[i] = (dtdx[i]*dtdx[i] + dtdy[i]*dtdy[i] + dtdz[i]*dtdz[i]) * jacinv[i];
74 
75  G12[i] = (drdx[i]*dsdx[i] + drdy[i]*dsdy[i] + drdz[i]*dsdz[i]) * jacinv[i];
76  G13[i] = (drdx[i]*dtdx[i] + drdy[i]*dtdy[i] + drdz[i]*dtdz[i]) * jacinv[i];
77  G23[i] = (dsdx[i]*dtdx[i] + dsdy[i]*dtdy[i] + dsdz[i]*dtdz[i]) * jacinv[i];
78 
79  shw3[j] = w3[j];
80 
81  j = j + CHUNKS;
82  }
83 
84  __syncthreads();
85 
86  for (int n = 0; n < nchunks; n++) {
87  const int ijk = iii + n * CHUNKS;
88  const int jk = ijk / LX;
89  i = ijk - jk * LX;
90  k = jk / LX;
91  j = jk - k * LX;
92  if ( i < LX && j < LX && k < LX) {
93  G11[ijk + e * LX * LX * LX] *= shw3[ijk];
94  G12[ijk + e * LX * LX * LX] *= shw3[ijk];
95  G13[ijk + e * LX * LX * LX] *= shw3[ijk];
96  G22[ijk + e * LX * LX * LX] *= shw3[ijk];
97  G23[ijk + e * LX * LX * LX] *= shw3[ijk];
98  G33[ijk + e * LX * LX * LX] *= shw3[ijk];
99  }
100  }
101 }
102 
106 template< typename T, const int LX, const int CHUNKS >
107 __global__ void coef_generate_dxyz_kernel(T * __restrict__ dxdr,
108  T * __restrict__ dydr,
109  T * __restrict__ dzdr,
110  T * __restrict__ dxds,
111  T * __restrict__ dyds,
112  T * __restrict__ dzds,
113  T * __restrict__ dxdt,
114  T * __restrict__ dydt,
115  T * __restrict__ dzdt,
116  const T * __restrict__ dx,
117  const T * __restrict__ dy,
118  const T * __restrict__ dz,
119  const T * __restrict__ x,
120  const T * __restrict__ y,
121  const T * __restrict__ z) {
122 
123  int i,j,k;
124 
125  const int e = blockIdx.x;
126  const int iii = threadIdx.x;
127  const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
128 
129  __shared__ T shdx[LX * LX];
130  __shared__ T shdy[LX * LX];
131  __shared__ T shdz[LX * LX];
132 
133  __shared__ T shu[LX * LX * LX];
134 
135  if (iii < (LX * LX)) {
136  shdx[iii] = dx[iii];
137  shdy[iii] = dy[iii];
138  shdz[iii] = dz[iii];
139  }
140 
141  j = iii;
142  while(j < (LX * LX * LX)) {
143  shu[j] = x[j + e * LX * LX * LX];
144  j = j + CHUNKS;
145  }
146 
147  __syncthreads();
148 
149  for (int n = 0; n < nchunks; n++) {
150  const int ijk = iii + n * CHUNKS;
151  const int jk = ijk / LX;
152  i = ijk - jk * LX;
153  k = jk / LX;
154  j = jk - k * LX;
155  if ( i < LX && j < LX && k < LX) {
156  T rtmp = 0.0;
157  T stmp = 0.0;
158  T ttmp = 0.0;
159  for (int l = 0; l < LX; l++) {
160  rtmp += shdx[i + l * LX] * shu[l + j * LX + k * LX * LX];
161  stmp += shdy[j + l * LX] * shu[i + l * LX + k * LX * LX];
162  ttmp += shdz[k + l * LX] * shu[i + j * LX + l * LX * LX];
163  }
164  dxdr[ijk + e * LX * LX * LX] = rtmp;
165  dxds[ijk + e * LX * LX * LX] = stmp;
166  dxdt[ijk + e * LX * LX * LX] = ttmp;
167  }
168  }
169 
170  __syncthreads();
171 
172  j = iii;
173  while(j < (LX * LX * LX)) {
174  shu[j] = y[j + e * LX * LX * LX];
175  j = j + CHUNKS;
176  }
177 
178  __syncthreads();
179 
180  for (int n = 0; n < nchunks; n++) {
181  const int ijk = iii + n * CHUNKS;
182  const int jk = ijk / LX;
183  i = ijk - jk * LX;
184  k = jk / LX;
185  j = jk - k * LX;
186  if ( i < LX && j < LX && k < LX) {
187  T rtmp = 0.0;
188  T stmp = 0.0;
189  T ttmp = 0.0;
190  for (int l = 0; l < LX; l++) {
191  rtmp += shdx[i + l * LX] * shu[l + j * LX + k * LX * LX];
192  stmp += shdy[j + l * LX] * shu[i + l * LX + k * LX * LX];
193  ttmp += shdz[k + l * LX] * shu[i + j * LX + l * LX * LX];
194  }
195  dydr[ijk + e * LX * LX * LX] = rtmp;
196  dyds[ijk + e * LX * LX * LX] = stmp;
197  dydt[ijk + e * LX * LX * LX] = ttmp;
198  }
199  }
200 
201  __syncthreads();
202 
203  j = iii;
204  while(j < (LX * LX * LX)) {
205  shu[j] = z[j + e * LX * LX * LX];
206  j = j + CHUNKS;
207  }
208 
209  __syncthreads();
210 
211  for (int n = 0; n < nchunks; n++) {
212  const int ijk = iii + n * CHUNKS;
213  const int jk = ijk / LX;
214  i = ijk - jk * LX;
215  k = jk / LX;
216  j = jk - k * LX;
217  if ( i < LX && j < LX && k < LX) {
218  T rtmp = 0.0;
219  T stmp = 0.0;
220  T ttmp = 0.0;
221  for (int l = 0; l < LX; l++) {
222  rtmp += shdx[i + l * LX] * shu[l + j * LX + k * LX * LX];
223  stmp += shdy[j + l * LX] * shu[i + l * LX + k * LX * LX];
224  ttmp += shdz[k + l * LX] * shu[i + j * LX + l * LX * LX];
225  }
226  dzdr[ijk + e * LX * LX * LX] = rtmp;
227  dzds[ijk + e * LX * LX * LX] = stmp;
228  dzdt[ijk + e * LX * LX * LX] = ttmp;
229  }
230  }
231 }
232 
236 template< typename T >
237 __global__ void coef_generate_drst_kernel(T * __restrict__ jac,
238  T * __restrict__ jacinv,
239  T * __restrict__ drdx,
240  T * __restrict__ drdy,
241  T * __restrict__ drdz,
242  T * __restrict__ dsdx,
243  T * __restrict__ dsdy,
244  T * __restrict__ dsdz,
245  T * __restrict__ dtdx,
246  T * __restrict__ dtdy,
247  T * __restrict__ dtdz,
248  const T * __restrict__ dxdr,
249  const T * __restrict__ dydr,
250  const T * __restrict__ dzdr,
251  const T * __restrict__ dxds,
252  const T * __restrict__ dyds,
253  const T * __restrict__ dzds,
254  const T * __restrict__ dxdt,
255  const T * __restrict__ dydt,
256  const T * __restrict__ dzdt,
257  const int n) {
258 
259  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
260  const int str = blockDim.x * gridDim.x;
261  const T one = 1.0;
262 
263  for (int i = idx; i < n; i += str) {
264  jac[i] = (dxdr[i] * dyds[i] * dzdt[i])
265  + (dxdt[i] * dydr[i] * dzds[i])
266  + (dxds[i] * dydt[i] * dzdr[i])
267  - (dxdr[i] * dydt[i] * dzds[i])
268  - (dxds[i] * dydr[i] * dzdt[i])
269  - (dxdt[i] * dyds[i] * dzdr[i]);
270  jacinv[i] = one / jac[i];
271 
272  drdx[i] = dyds[i]*dzdt[i] - dydt[i]*dzds[i];
273  drdy[i] = dxdt[i]*dzds[i] - dxds[i]*dzdt[i];
274  drdz[i] = dxds[i]*dydt[i] - dxdt[i]*dyds[i];
275  dsdx[i] = dydt[i]*dzdr[i] - dydr[i]*dzdt[i];
276  dsdy[i] = dxdr[i]*dzdt[i] - dxdt[i]*dzdr[i];
277  dsdz[i] = dxdt[i]*dydr[i] - dxdr[i]*dydt[i];
278  dtdx[i] = dydr[i]*dzds[i] - dyds[i]*dzdr[i];
279  dtdy[i] = dxds[i]*dzdr[i] - dxdr[i]*dzds[i];
280  dtdz[i] = dxdr[i]*dyds[i] - dxds[i]*dydr[i];
281 
282  }
283 
284 }
285 
286 #endif // __SEM_COEF_KERNEL_H__
__global__ void const T *__restrict__ x
Definition: cdtp_kernel.h:106
const int i
Definition: cdtp_kernel.h:128
const int e
Definition: cdtp_kernel.h:126
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
Definition: cdtp_kernel.h:113
const int j
Definition: cdtp_kernel.h:127
__syncthreads()
__global__ void coef_generate_dxyz_kernel(T *__restrict__ dxdr, T *__restrict__ dydr, T *__restrict__ dzdr, T *__restrict__ dxds, T *__restrict__ dyds, T *__restrict__ dzds, T *__restrict__ dxdt, T *__restrict__ dydt, T *__restrict__ dzdt, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ x, const T *__restrict__ y, const T *__restrict__ z)
Definition: coef_kernel.h:107
__global__ void coef_generate_drst_kernel(T *__restrict__ jac, T *__restrict__ jacinv, T *__restrict__ drdx, T *__restrict__ drdy, T *__restrict__ drdz, T *__restrict__ dsdx, T *__restrict__ dsdy, T *__restrict__ dsdz, T *__restrict__ dtdx, T *__restrict__ dtdy, T *__restrict__ dtdz, const T *__restrict__ dxdr, const T *__restrict__ dydr, const T *__restrict__ dzdr, const T *__restrict__ dxds, const T *__restrict__ dyds, const T *__restrict__ dzds, const T *__restrict__ dxdt, const T *__restrict__ dydt, const T *__restrict__ dzdt, const int n)
Definition: coef_kernel.h:237
__global__ void coef_generate_geo_kernel(T *__restrict__ G11, T *__restrict__ G12, T *__restrict__ G13, T *__restrict__ G22, T *__restrict__ G23, T *__restrict__ G33, const T *__restrict__ drdx, const T *__restrict__ drdy, const T *__restrict__ drdz, const T *__restrict__ dsdx, const T *__restrict__ dsdy, const T *__restrict__ dsdz, const T *__restrict__ dtdx, const T *__restrict__ dtdy, const T *__restrict__ dtdz, const T *__restrict__ jacinv, const T *__restrict__ w3, const int gdim)
Definition: coef_kernel.h:41
__shared__ T shdy[LX *LX]
Definition: conv1_kernel.h:153
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
Definition: conv1_kernel.h:148
__shared__ T shdz[LX *LX]
Definition: conv1_kernel.h:154
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
Definition: conv1_kernel.h:139
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
Definition: conv1_kernel.h:141
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
Definition: conv1_kernel.h:147
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
Definition: conv1_kernel.h:136
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
Definition: conv1_kernel.h:140
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
Definition: conv1_kernel.h:144
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
Definition: conv1_kernel.h:146
__shared__ T shdx[LX *LX]
Definition: conv1_kernel.h:152
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
Definition: conv1_kernel.h:143
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
Definition: conv1_kernel.h:142
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
Definition: conv1_kernel.h:138
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
Definition: conv1_kernel.h:145
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
Definition: conv1_kernel.h:137
__shared__ T shu[LX *LX]