Neko  0.8.1
A portable framework for high-order spectral element flow simulations
cdtp_kernel.h
Go to the documentation of this file.
1 #ifndef __MATH_CDTP_KERNEL_H__
2 #define __MATH_CDTP_KERNEL_H__
3 /*
4  Copyright (c) 2021-2023, The Neko Authors
5  All rights reserved.
6 
7  Redistribution and use in source and binary forms, with or without
8  modification, are permitted provided that the following conditions
9  are met:
10 
11  * Redistributions of source code must retain the above copyright
12  notice, this list of conditions and the following disclaimer.
13 
14  * Redistributions in binary form must reproduce the above
15  copyright notice, this list of conditions and the following
16  disclaimer in the documentation and/or other materials provided
17  with the distribution.
18 
19  * Neither the name of the authors nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  POSSIBILITY OF SUCH DAMAGE.
35 */
36 
/**
 * Device kernel for the weighted transposed-derivative product,
 * one-dimensional thread layout.
 *
 * Each thread block handles one element, e = blockIdx.x, of LX^3 points.
 * With w = x * B / jac evaluated pointwise, the kernel writes
 *
 *   dtx(i,j,k) = sum_l dxt[i + l*LX] * (w*dr)(l,j,k)
 *              + sum_l dyt[j + l*LX] * (w*ds)(i,l,k)
 *              + sum_l dzt[k + l*LX] * (w*dt)(i,j,l)
 *
 * where (i,j,k) is stored at i + j*LX + k*LX*LX inside the element.
 *
 * Launch layout: all strides below use the template parameter CHUNKS
 * rather than blockDim.x, so the kernel expects exactly CHUNKS threads
 * along x -- NOTE(review): confirm against the host-side launcher.
 *
 * Static shared memory: 3*LX*LX + 3*LX*LX*LX elements of T per block.
 *
 * @param dtx  output, LX^3 values per element
 * @param x    input field, LX^3 values per element
 * @param dr   per-point factor applied to the dxt contraction
 * @param ds   per-point factor applied to the dyt contraction
 * @param dt   per-point factor applied to the dzt contraction
 * @param dxt  LX*LX matrix, shared by all elements
 * @param dyt  LX*LX matrix, shared by all elements
 * @param dzt  LX*LX matrix, shared by all elements
 * @param B    per-point multiplier of x
 * @param jac  per-point divisor of x * B
 */
template< typename T, const int LX, const int CHUNKS >
__global__ void cdtp_kernel_1d(T * __restrict__ dtx,
                               const T * __restrict__ x,
                               const T * __restrict__ dr,
                               const T * __restrict__ ds,
                               const T * __restrict__ dt,
                               const T * __restrict__ dxt,
                               const T * __restrict__ dyt,
                               const T * __restrict__ dzt,
                               const T * __restrict__ B,
                               const T * __restrict__ jac) {

  __shared__ T shdxt[LX * LX];
  __shared__ T shdyt[LX * LX];
  __shared__ T shdzt[LX * LX];

  __shared__ T shtar[LX * LX * LX];
  __shared__ T shtas[LX * LX * LX];
  __shared__ T shtat[LX * LX * LX];

  const int lx2 = LX * LX;
  const int lx3 = LX * LX * LX;

  const int e = blockIdx.x;
  const int iii = threadIdx.x;
  const int ele = e * lx3;  // offset of this element in the global arrays

  // Stage the three LX*LX matrices in shared memory. A strided loop is
  // used (instead of a single guarded store per thread) so that every
  // entry is loaded even when CHUNKS < LX*LX.
  for (int m = iii; m < lx2; m += CHUNKS) {
    shdxt[m] = dxt[m];
    shdyt[m] = dyt[m];
    shdzt[m] = dzt[m];
  }

  // Stage the weighted per-point factors for the whole element.
  for (int l = iii; l < lx3; l += CHUNKS) {
    // We can probably avoid this division by use of jacinv instead.
    const T wx = (x[l + ele] * B[l + ele]) / jac[l + ele];

    shtar[l] = wx * dr[l + ele];
    shtas[l] = wx * ds[l + ele];
    shtat[l] = wx * dt[l + ele];
  }

  // Every thread reads shared entries written by other threads below.
  __syncthreads();

  for (int ijk = iii; ijk < lx3; ijk += CHUNKS) {
    // Recover (i,j,k) from the linear index ijk = i + j*LX + k*LX*LX.
    const int jk = ijk / LX;
    const int i = ijk - jk * LX;
    const int k = jk / LX;
    const int j = jk - k * LX;

    T rtmp = 0.0;
    T stmp = 0.0;
    T ttmp = 0.0;
    for (int l = 0; l < LX; l++) {
      rtmp += shdxt[i + l * LX] * shtar[l + j * LX + k * LX * LX];
      stmp += shdyt[j + l * LX] * shtas[i + l * LX + k * LX * LX];
      ttmp += shdzt[k + l * LX] * shtat[i + j * LX + l * LX * LX];
    }
    dtx[ijk + ele] = ( rtmp + stmp + ttmp );
  }
}
105 
/**
 * Device kernel for the weighted transposed-derivative product,
 * two-dimensional thread layout stepping through k.
 *
 * Each thread block handles one element, e = blockIdx.x. Thread
 * (i,j) = (threadIdx.x, threadIdx.y) owns the column of LX points
 * (i,j,0..LX-1) and keeps the weighted factors for that column in
 * per-thread arrays. With w = x * B / jac evaluated pointwise, the
 * kernel writes
 *
 *   dtx(i,j,k) = sum_l dxt[i + l*LX] * (w*dr)(l,j,k)
 *              + sum_l dyt[j + l*LX] * (w*ds)(i,l,k)
 *              + sum_l dzt[k + l*LX] * (w*dt)(i,j,l)
 *
 * Launch layout: ij = threadIdx.x + threadIdx.y * LX indexes arrays of
 * LX*LX entries without a bounds check, and __launch_bounds__ caps the
 * block at LX*LX threads, so the kernel expects blockDim = (LX, LX, 1)
 * -- NOTE(review): confirm against the host-side launcher.
 *
 * Static shared memory: 5*LX*LX elements of T per block.
 *
 * @param dtx  output, LX^3 values per element
 * @param x    input field, LX^3 values per element
 * @param dr   per-point factor applied to the dxt contraction
 * @param ds   per-point factor applied to the dyt contraction
 * @param dt   per-point factor applied to the dzt contraction
 * @param dxt  LX*LX matrix, shared by all elements
 * @param dyt  LX*LX matrix, shared by all elements
 * @param dzt  LX*LX matrix, shared by all elements
 * @param B    per-point multiplier of x
 * @param jac  per-point divisor of x * B
 */
template< typename T, const int LX >
__global__ void __launch_bounds__(LX*LX,3)
  cdtp_kernel_kstep(T * __restrict__ dtx,
                    const T * __restrict__ x,
                    const T * __restrict__ dr,
                    const T * __restrict__ ds,
                    const T * __restrict__ dt,
                    const T * __restrict__ dxt,
                    const T * __restrict__ dyt,
                    const T * __restrict__ dzt,
                    const T * __restrict__ B,
                    const T * __restrict__ jac) {

  __shared__ T shdxt[LX * LX];
  __shared__ T shdyt[LX * LX];
  __shared__ T shdzt[LX * LX];

  // One k-slab of the r- and s-direction factors, refreshed every k step.
  __shared__ T shtar[LX * LX];
  __shared__ T shtas[LX * LX];

  // Per-thread copies of the weighted factors along k for this (i,j).
  T rtar[LX];
  T rtas[LX];
  T rtat[LX];

  const int e = blockIdx.x;
  const int j = threadIdx.y;
  const int i = threadIdx.x;
  const int ij = i + j * LX;
  const int ele = e*LX*LX*LX;  // offset of this element in the global arrays

  // Each thread stages one entry of each LX*LX matrix.
  shdxt[ij] = dxt[ij];
  shdyt[ij] = dyt[ij];
  shdzt[ij] = dzt[ij];

#pragma unroll LX
  for (int k = 0; k < LX; ++k) {
    const int ijk = ij + k*LX*LX + ele;
    const T wx = (x[ijk] * B[ijk]) / jac[ijk];

    rtar[k] = wx * dr[ijk];
    rtas[k] = wx * ds[ijk];
    rtat[k] = wx * dt[ijk];
  }

  // The shdzt entries read at the top of the k loop were written by
  // other threads; they must be visible before the first read.
  __syncthreads();

#pragma unroll
  for (int k = 0; k < LX; ++k) {
    const int ijk = ij + k*LX*LX;

    // Publish this thread's slab-k values for the r/s contractions.
    shtar[ij] = rtar[k];
    shtas[ij] = rtas[k];

    // The t-direction contraction only needs this thread's own column.
    T ttmp = 0.0;
    for (int l = 0; l < LX; l++) {
      ttmp += shdzt[k+l*LX] * rtat[l];
    }

    // Slab k of shtar/shtas must be complete before anyone reads it.
    __syncthreads();

    T rtmp = 0.0;
    T stmp = 0.0;
#pragma unroll
    for (int l = 0; l < LX; l++) {
      rtmp += shdxt[i+l*LX] * shtar[l+j*LX];
      stmp += shdyt[j+l*LX] * shtas[i+l*LX];
    }

    dtx[ijk + ele] = ( rtmp + stmp + ttmp );

    // Do not let the next iteration overwrite shtar/shtas while other
    // threads are still reading slab k.
    __syncthreads();
  }
}
177 
178 
179 #endif // __MATH_CDTP_KERNEL_H__
__shared__ T shdyt[LX *LX]
Definition: cdtp_kernel.h:120
__global__ void const T *__restrict__ const T *__restrict__ dr
Definition: cdtp_kernel.h:110
T rtas[LX]
Definition: cdtp_kernel.h:127
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
Definition: cdtp_kernel.h:111
__global__ void const T *__restrict__ x
Definition: cdtp_kernel.h:109
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
Definition: cdtp_kernel.h:112
__shared__ T shtas[LX *LX]
Definition: cdtp_kernel.h:124
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ B
Definition: cdtp_kernel.h:116
const int i
Definition: cdtp_kernel.h:132
T rtat[LX]
Definition: cdtp_kernel.h:128
const int ij
Definition: cdtp_kernel.h:133
const int e
Definition: cdtp_kernel.h:130
T rtar[LX]
Definition: cdtp_kernel.h:126
__global__ void __launch_bounds__(LX *LX, 3) cdtp_kernel_kstep(T *__restrict__ dtx
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jac
Definition: cdtp_kernel.h:117
__shared__ T shdzt[LX *LX]
Definition: cdtp_kernel.h:121
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
Definition: cdtp_kernel.h:115
const int ele
Definition: cdtp_kernel.h:134
const int j
Definition: cdtp_kernel.h:131
__shared__ T shtar[LX *LX]
Definition: cdtp_kernel.h:123
__syncthreads()
__global__ void cdtp_kernel_1d(T *__restrict__ dtx, const T *__restrict__ x, const T *__restrict__ dr, const T *__restrict__ ds, const T *__restrict__ dt, const T *__restrict__ dxt, const T *__restrict__ dyt, const T *__restrict__ dzt, const T *__restrict__ B, const T *__restrict__ jac)
Definition: cdtp_kernel.h:42
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
Definition: cdtp_kernel.h:114
shdxt[ij]
Definition: cdtp_kernel.h:136
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
Definition: cdtp_kernel.h:113