Neko  0.9.0
A portable framework for high-order spectral element flow simulations
dudxyz_kernel.h
Go to the documentation of this file.
1 #ifndef __MATH_DUDXYZ_KERNEL_H__
2 #define __MATH_DUDXYZ_KERNEL_H__
3 /*
4  Copyright (c) 2021-2023, The Neko Authors
5  All rights reserved.
6 
7  Redistribution and use in source and binary forms, with or without
8  modification, are permitted provided that the following conditions
9  are met:
10 
11  * Redistributions of source code must retain the above copyright
12  notice, this list of conditions and the following disclaimer.
13 
14  * Redistributions in binary form must reproduce the above
15  copyright notice, this list of conditions and the following
16  disclaimer in the documentation and/or other materials provided
17  with the distribution.
18 
19  * Neither the name of the authors nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  POSSIBILITY OF SUCH DAMAGE.
35 */
36 
41 template< typename T, const int LX, const int CHUNKS >
42 __global__ void dudxyz_kernel_1d(T * __restrict__ du,
43  const T * __restrict__ u,
44  const T * __restrict__ dr,
45  const T * __restrict__ ds,
46  const T * __restrict__ dt,
47  const T * __restrict__ dx,
48  const T * __restrict__ dy,
49  const T * __restrict__ dz,
50  const T * __restrict__ jacinv) {
51 
52  __shared__ T shu[LX * LX * LX];
53  __shared__ T shdr[LX * LX * LX];
54  __shared__ T shds[LX * LX * LX];
55  __shared__ T shdt[LX * LX * LX];
56 
57  __shared__ T shdx[LX * LX];
58  __shared__ T shdy[LX * LX];
59  __shared__ T shdz[LX * LX];
60 
61  __shared__ T shjacinv[LX * LX * LX];
62 
63  const int e = blockIdx.x;
64  const int iii = threadIdx.x;
65  const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
66 
67  if (iii < (LX * LX)) {
68  shdx[iii] = dx[iii];
69  shdy[iii] = dy[iii];
70  shdz[iii] = dz[iii];
71  }
72 
73  int l = iii;
74  while(l < (LX * LX * LX)) {
75  shu[l] = u[l + e * LX * LX * LX];
76  shdr[l] = dr[l + e * LX * LX * LX];
77  shds[l] = ds[l + e * LX * LX * LX];
78  shdt[l] = dt[l + e * LX * LX * LX];
79  shjacinv[l] = jacinv[l + e * LX * LX * LX];
80  l = l + CHUNKS;
81  }
82 
83  __syncthreads();
84 
85  for (int n = 0; n < nchunks; n++) {
86  const int ijk = iii + n * CHUNKS;
87  const int jk = ijk / LX;
88  const int i = ijk - jk * LX;
89  const int k = jk / LX;
90  const int j = jk - k * LX;
91  if ( i < LX && j < LX && k < LX) {
92  T rtmp = 0.0;
93  T stmp = 0.0;
94  T ttmp = 0.0;
95  for (int l = 0; l < LX; l++) {
96  rtmp += shdx[i + l * LX] * shu[l + j * LX + k * LX * LX];
97  stmp += shdy[j + l * LX] * shu[i + l * LX + k * LX * LX];
98  ttmp += shdz[k + l * LX] * shu[i + j * LX + l * LX * LX];
99  }
100  du[ijk + e * LX * LX * LX] = ((rtmp * shdr[ijk])
101  + (stmp * shds[ijk])
102  + (ttmp * shdt[ijk]))
103  * shjacinv[ijk];
104 
105  }
106  }
107 }
108 
109 template< typename T, const int LX >
110 __global__ void __launch_bounds__(LX*LX,3)
111  dudxyz_kernel_kstep(T * __restrict__ du,
112  const T * __restrict__ u,
113  const T * __restrict__ dr,
114  const T * __restrict__ ds,
115  const T * __restrict__ dt,
116  const T * __restrict__ dx,
117  const T * __restrict__ dy,
118  const T * __restrict__ dz,
119  const T * __restrict__ jacinv) {
120 
121  __shared__ T shu[LX * LX];
122 
123  __shared__ T shdx[LX * LX];
124  __shared__ T shdy[LX * LX];
125  __shared__ T shdz[LX * LX];
126 
127  const int e = blockIdx.x;
128  const int j = threadIdx.y;
129  const int i = threadIdx.x;
130  const int ij = i + j * LX;
131  const int ele = e*LX*LX*LX;
132 
133  shdx[ij] = dx[ij];
134  shdy[ij] = dy[ij];
135  shdz[ij] = dz[ij];
136 
137  T ru[LX];
138  T rdr[LX];
139  T rds[LX];
140  T rdt[LX];
141  T rjacinv[LX];
142 
143  #pragma unroll LX
144  for (int k = 0; k < LX; ++k) {
145  ru[k] = u[ij + k*LX*LX + ele];
146  rdr[k] = dr[ij + k*LX*LX + ele];
147  rds[k] = ds[ij + k*LX*LX + ele];
148  rdt[k] = dt[ij + k*LX*LX + ele];
149  rjacinv[k] = jacinv[ij + k*LX*LX + ele];
150  }
151 
153 
154  #pragma unroll
155  for (int k = 0; k < LX; ++k) {
156  const int ijk = ij + k*LX*LX;
157  T ttmp = 0.0;
158  shu[ij] = ru[k];
159  for (int l = 0; l < LX; l++) {
160  ttmp += shdz[k+l*LX] * ru[l];
161  }
162  __syncthreads();
163 
164  T rtmp = 0.0;
165  T stmp = 0.0;
166 #pragma unroll
167  for (int l = 0; l < LX; l++) {
168  rtmp += shdx[i+l*LX] * shu[l+j*LX];
169  stmp += shdy[j+l*LX] * shu[i+l*LX];
170  }
171 
172  du[ijk + ele] = rjacinv[k] * ((rtmp * rdr[k])
173  + (stmp * rds[k])
174  + (ttmp * rdt[k]));
175  __syncthreads();
176  }
177 }
178 
179 #endif // __MATH_DUDXYZ_KERNEL_H__
__shared__ T shu[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void const T *__restrict__ const T *__restrict__ dr
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
T ru[LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__shared__ T shdz[LX *LX]
T rdt[LX]
const int i
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
const int ij
__shared__ T shdy[LX *LX]
T rjacinv[LX]
const int e
T rds[LX]
__global__ void __launch_bounds__(LX *LX, 3) dudxyz_kernel_kstep(T *__restrict__ du
__global__ void dudxyz_kernel_1d(T *__restrict__ du, const T *__restrict__ u, const T *__restrict__ dr, const T *__restrict__ ds, const T *__restrict__ dt, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ jacinv)
Definition: dudxyz_kernel.h:42
__shared__ T shdx[LX *LX]
const int ele
__global__ void const T *__restrict__ u
const int j
T rdr[LX]
__syncthreads()
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv