Neko  0.9.99
A portable framework for high-order spectral element flow simulations
cdtp_kernel.h
Go to the documentation of this file.
1 #ifndef __MATH_CDTP_KERNEL_H__
2 #define __MATH_CDTP_KERNEL_H__
3 /*
4  Copyright (c) 2021-2023, The Neko Authors
5  All rights reserved.
6 
7  Redistribution and use in source and binary forms, with or without
8  modification, are permitted provided that the following conditions
9  are met:
10 
11  * Redistributions of source code must retain the above copyright
12  notice, this list of conditions and the following disclaimer.
13 
14  * Redistributions in binary form must reproduce the above
15  copyright notice, this list of conditions and the following
16  disclaimer in the documentation and/or other materials provided
17  with the distribution.
18 
19  * Neither the name of the authors nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  POSSIBILITY OF SUCH DAMAGE.
35 */
36 
41 template< typename T, const int LX, const int CHUNKS >
42 __global__ void cdtp_kernel_1d(T * __restrict__ dtx,
43  const T * __restrict__ x,
44  const T * __restrict__ dr,
45  const T * __restrict__ ds,
46  const T * __restrict__ dt,
47  const T * __restrict__ dxt,
48  const T * __restrict__ dyt,
49  const T * __restrict__ dzt,
50  const T * __restrict__ w3) {
51 
52  __shared__ T shdxt[LX * LX];
53  __shared__ T shdyt[LX * LX];
54  __shared__ T shdzt[LX * LX];
55 
56  __shared__ T shtar[LX * LX * LX];
57  __shared__ T shtas[LX * LX * LX];
58  __shared__ T shtat[LX * LX * LX];
59 
60  const int e = blockIdx.x;
61  const int iii = threadIdx.x;
62  const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
63 
64  if (iii < (LX * LX)) {
65  shdxt[iii] = dxt[iii];
66  shdyt[iii] = dyt[iii];
67  shdzt[iii] = dzt[iii];
68  }
69 
70  int l = iii;
71  while(l < (LX * LX * LX)) {
72  T wx = x[l + e * LX * LX * LX] * w3[l];
73 
74  shtar[l] = wx*dr[l + e * LX * LX * LX];
75  shtas[l] = wx*ds[l + e * LX * LX * LX];
76  shtat[l] = wx*dt[l + e * LX * LX * LX];
77 
78  l = l + CHUNKS;
79  }
80 
81  __syncthreads();
82  for (int n = 0; n < nchunks; n++) {
83  const int ijk = iii + n * CHUNKS;
84  const int jk = ijk / LX;
85  const int i = ijk - jk * LX;
86  const int k = jk / LX;
87  const int j = jk - k * LX;
88  if ( i < LX && j < LX && k < LX && ijk < LX*LX*LX) {
89  T rtmp = 0.0;
90  T stmp = 0.0;
91  T ttmp = 0.0;
92  for (int l = 0; l < LX; l++) {
93  rtmp += shdxt[i + l * LX] * shtar[l+j*LX+k*LX*LX];
94  stmp += shdyt[j + l * LX] * shtas[i+l*LX + k*LX*LX];
95  ttmp += shdzt[k + l * LX] * shtat[i + j*LX + l*LX*LX];
96  }
97  dtx[ijk + e * LX * LX * LX] = ( rtmp + stmp + ttmp );
98 
99  }
100  }
101 }
102 
103 template< typename T, const int LX >
104 __global__ void __launch_bounds__(LX*LX,3)
105  cdtp_kernel_kstep(T * __restrict__ dtx,
106  const T * __restrict__ x,
107  const T * __restrict__ dr,
108  const T * __restrict__ ds,
109  const T * __restrict__ dt,
110  const T * __restrict__ dxt,
111  const T * __restrict__ dyt,
112  const T * __restrict__ dzt,
113  const T * __restrict__ w3) {
114 
115  __shared__ T shdxt[LX * LX];
116  __shared__ T shdyt[LX * LX];
117  __shared__ T shdzt[LX * LX];
118 
119  __shared__ T shtar[LX * LX];
120  __shared__ T shtas[LX * LX];
121 
122  T rtar[LX];
123  T rtas[LX];
124  T rtat[LX];
125 
126  const int e = blockIdx.x;
127  const int j = threadIdx.y;
128  const int i = threadIdx.x;
129  const int ij = i + j * LX;
130  const int ele = e*LX*LX*LX;
131 
132  shdxt[ij] = dxt[ij];
133  shdyt[ij] = dyt[ij];
134  shdzt[ij] = dzt[ij];
135 
136 
137 #pragma unroll LX
138  for (int k = 0; k < LX; ++k) {
139  T wx = x[ij + k*LX*LX + ele] * w3[ij + k*LX*LX];
140 
141  rtar[k] = wx *dr[ij + k*LX*LX + ele];
142  rtas[k] = wx *ds[ij + k*LX*LX + ele];
143  rtat[k] = wx *dt[ij + k*LX*LX + ele];
144  }
145 
147 
148 #pragma unroll
149  for (int k = 0; k < LX; ++k) {
150  const int ijk = ij + k*LX*LX;
151  T ttmp = 0.0;
152  shtar[ij] = rtar[k];
153  shtas[ij] = rtas[k];
154  for (int l = 0; l < LX; l++) {
155  ttmp += shdzt[k+l*LX] * rtat[l];
156  }
157  __syncthreads();
158 
159  T rtmp = 0.0;
160  T stmp = 0.0;
161 #pragma unroll
162  for (int l = 0; l < LX; l++) {
163  rtmp += shdxt[i+l*LX] * shtar[l+j*LX];
164  stmp += shdyt[j+l*LX] * shtas[i+l*LX];
165  }
166 
167  dtx[ijk + ele] = ( rtmp + stmp + ttmp );
168 
169  __syncthreads();
170  }
171 }
172 
173 
174 #endif // __MATH_CDTP_KERNEL_H__
__shared__ T shdyt[LX *LX]
Definition: cdtp_kernel.h:116
__global__ void const T *__restrict__ const T *__restrict__ dr
Definition: cdtp_kernel.h:107
T rtas[LX]
Definition: cdtp_kernel.h:123
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
Definition: cdtp_kernel.h:108
__global__ void const T *__restrict__ x
Definition: cdtp_kernel.h:106
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
Definition: cdtp_kernel.h:109
__shared__ T shtas[LX *LX]
Definition: cdtp_kernel.h:120
const int i
Definition: cdtp_kernel.h:128
T rtat[LX]
Definition: cdtp_kernel.h:124
const int ij
Definition: cdtp_kernel.h:129
const int e
Definition: cdtp_kernel.h:126
T rtar[LX]
Definition: cdtp_kernel.h:122
__global__ void __launch_bounds__(LX *LX, 3) cdtp_kernel_kstep(T *__restrict__ dtx
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
Definition: cdtp_kernel.h:113
__shared__ T shdzt[LX *LX]
Definition: cdtp_kernel.h:117
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
Definition: cdtp_kernel.h:112
const int ele
Definition: cdtp_kernel.h:130
const int j
Definition: cdtp_kernel.h:127
__shared__ T shtar[LX *LX]
Definition: cdtp_kernel.h:119
__global__ void cdtp_kernel_1d(T *__restrict__ dtx, const T *__restrict__ x, const T *__restrict__ dr, const T *__restrict__ ds, const T *__restrict__ dt, const T *__restrict__ dxt, const T *__restrict__ dyt, const T *__restrict__ dzt, const T *__restrict__ w3)
Definition: cdtp_kernel.h:42
__syncthreads()
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
Definition: cdtp_kernel.h:111
shdxt[ij]
Definition: cdtp_kernel.h:132
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
Definition: cdtp_kernel.h:110