Neko 0.8.99
A portable framework for high-order spectral element flow simulations
gs_kernels.h
/*
 Copyright (c) 2021, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef __GS_GS_KERNELS__
#define __GS_GS_KERNELS__

/**
 * Device gather kernel (addition)
 */
template< typename T >
__global__ void gather_kernel_add(T * __restrict__ v,
                                  const int m,
                                  const int o,
                                  const int * __restrict__ dg,
                                  const T * __restrict__ u,
                                  const int n,
                                  const int * __restrict__ gd,
                                  const int nb,
                                  const int * __restrict__ b,
                                  const int * __restrict__ bo) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  /* Sum each block of shared degrees of freedom into its gather point */
  for (int i = idx; i < nb; i += str) {
    const int blk_len = b[i];
    const int k = bo[i];
    T tmp = u[gd[k] - 1];
    for (int j = 1; j < blk_len; j++) {
      tmp += u[gd[k + j] - 1];
    }
    v[dg[k] - 1] = tmp;
  }

  /* Remainder after the blocks: o < 0 marks plain one-to-one copies,
     o > 0 marks entries that come in pairs */
  if (o < 0) {
    for (int i = ((abs(o) - 1) + idx); i < m; i += str) {
      v[dg[i] - 1] = u[gd[i] - 1];
    }
  }
  else {
    if (idx % 2 == 0) {
      for (int i = ((o - 1) + idx); i < m; i += str) {
        T tmp = u[gd[i] - 1] + u[gd[i + 1] - 1];
        v[dg[i] - 1] = tmp;
      }
    }
  }

}

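/*
 * For context: a minimal, hypothetical host-side launch of the gather
 * kernel above. The thread/block sizes and the *_d device pointers are
 * illustrative assumptions, not part of this header.
 */
template< typename T >
void launch_gather_add(T *v_d, const int m, const int o, const int *dg_d,
                       const T *u_d, const int n, const int *gd_d,
                       const int nb, const int *b_d, const int *bo_d) {

  const int nthrds = 1024;                      /* threads per block (assumed) */
  const int nblcks = (m + nthrds - 1) / nthrds; /* grid covering all m entries */

  gather_kernel_add<T><<<nblcks, nthrds>>>(v_d, m, o, dg_d, u_d, n,
                                           gd_d, nb, b_d, bo_d);
}
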
/**
 * Device gather kernel (multiplication)
 */
template< typename T >
__global__ void gather_kernel_mul(T * __restrict__ v,
                                  const int m,
                                  const int o,
                                  const int * __restrict__ dg,
                                  const T * __restrict__ u,
                                  const int n,
                                  const int * __restrict__ gd,
                                  const int nb,
                                  const int * __restrict__ b,
                                  const int * __restrict__ bo) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  /* Multiply each block of shared degrees of freedom into its gather point */
  for (int i = idx; i < nb; i += str) {
    const int blk_len = b[i];
    const int k = bo[i];
    T tmp = u[gd[k] - 1];
    for (int j = 1; j < blk_len; j++) {
      tmp *= u[gd[k + j] - 1];
    }
    v[dg[k] - 1] = tmp;
  }

  /* Remainder: o < 0 marks plain copies, o > 0 marks pairs */
  if (o < 0) {
    for (int i = ((abs(o) - 1) + idx); i < m; i += str) {
      v[dg[i] - 1] = u[gd[i] - 1];
    }
  }
  else {
    if (idx % 2 == 0) {
      for (int i = ((o - 1) + idx); i < m; i += str) {
        T tmp = u[gd[i] - 1] * u[gd[i + 1] - 1];
        v[dg[i] - 1] = tmp;
      }
    }
  }

}

/**
 * Device gather kernel (minimum)
 */
template< typename T >
__global__ void gather_kernel_min(T * __restrict__ v,
                                  const int m,
                                  const int o,
                                  const int * __restrict__ dg,
                                  const T * __restrict__ u,
                                  const int n,
                                  const int * __restrict__ gd,
                                  const int nb,
                                  const int * __restrict__ b,
                                  const int * __restrict__ bo) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  /* Reduce each block of shared degrees of freedom to its minimum */
  for (int i = idx; i < nb; i += str) {
    const int blk_len = b[i];
    const int k = bo[i];
    T tmp = u[gd[k] - 1];
    for (int j = 1; j < blk_len; j++) {
      tmp = min(u[gd[k + j] - 1], tmp);
    }
    v[dg[k] - 1] = tmp;
  }

  /* Remainder: o < 0 marks plain copies, o > 0 marks pairs */
  if (o < 0) {
    for (int i = ((abs(o) - 1) + idx); i < m; i += str) {
      v[dg[i] - 1] = u[gd[i] - 1];
    }
  }
  else {
    if (idx % 2 == 0) {
      for (int i = ((o - 1) + idx); i < m; i += str) {
        T tmp = min(u[gd[i] - 1], u[gd[i + 1] - 1]);
        v[dg[i] - 1] = tmp;
      }
    }
  }

}

/**
 * Device gather kernel (maximum)
 */
template< typename T >
__global__ void gather_kernel_max(T * __restrict__ v,
                                  const int m,
                                  const int o,
                                  const int * __restrict__ dg,
                                  const T * __restrict__ u,
                                  const int n,
                                  const int * __restrict__ gd,
                                  const int nb,
                                  const int * __restrict__ b,
                                  const int * __restrict__ bo) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  /* Reduce each block of shared degrees of freedom to its maximum */
  for (int i = idx; i < nb; i += str) {
    const int blk_len = b[i];
    const int k = bo[i];
    T tmp = u[gd[k] - 1];
    for (int j = 1; j < blk_len; j++) {
      tmp = max(u[gd[k + j] - 1], tmp);
    }
    v[dg[k] - 1] = tmp;
  }

  /* Remainder: o < 0 marks plain copies, o > 0 marks pairs */
  if (o < 0) {
    for (int i = ((abs(o) - 1) + idx); i < m; i += str) {
      v[dg[i] - 1] = u[gd[i] - 1];
    }
  }
  else {
    if (idx % 2 == 0) {
      for (int i = ((o - 1) + idx); i < m; i += str) {
        T tmp = max(u[gd[i] - 1], u[gd[i + 1] - 1]);
        v[dg[i] - 1] = tmp;
      }
    }
  }

}

/**
 * Device scatter kernel
 */
template< typename T >
__global__ void scatter_kernel(T * __restrict__ v,
                               const int m,
                               const int * __restrict__ dg,
                               T * __restrict__ u,
                               const int n,
                               const int * __restrict__ gd,
                               const int nb,
                               const int * __restrict__ b,
                               const int * __restrict__ bo) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  /* Copy each gathered value back to all duplicates in its block */
  for (int i = idx; i < nb; i += str) {
    const int blk_len = b[i];
    const int k = bo[i];
    T tmp = v[dg[k] - 1];
    for (int j = 0; j < blk_len; j++) {
      u[gd[k + j] - 1] = tmp;
    }
  }

  /* Remainder after the last block: plain one-to-one copies */
  const int facet_offset = bo[nb - 1] + b[nb - 1];

  for (int i = ((facet_offset - 1) + idx); i < m; i += str) {
    u[gd[i] - 1] = v[dg[i] - 1];
  }

}

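/*
 * The scatter kernel is the inverse of the gathers above: once a gather
 * has combined all duplicated values at their gather points, scatter
 * copies the combined value back to every duplicate. A hypothetical
 * gather-then-scatter "add" sequence, with assumed *_d device pointers:
 */
template< typename T >
void gs_add_example(T *v_d, const int m, const int o, const int *dg_d,
                    T *u_d, const int n, const int *gd_d,
                    const int nb, const int *b_d, const int *bo_d) {

  const int nthrds = 1024;
  const int nblcks = (m + nthrds - 1) / nthrds;

  /* Combine duplicated entries of u into v... */
  gather_kernel_add<T><<<nblcks, nthrds>>>(v_d, m, o, dg_d, u_d, n,
                                           gd_d, nb, b_d, bo_d);

  /* ...then write each combined value back to all of its duplicates */
  scatter_kernel<T><<<nblcks, nthrds>>>(v_d, m, dg_d, u_d, n,
                                        gd_d, nb, b_d, bo_d);
}
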
/**
 * Device kernel packing shared values into a contiguous send buffer
 */
template< typename T >
__global__ void gs_pack_kernel(const T * __restrict__ u,
                               T * __restrict__ buf,
                               const int32_t * __restrict__ dof,
                               const int n) {

  const int j = threadIdx.x + blockDim.x * blockIdx.x;

  if (j >= n)
    return;

  buf[j] = u[dof[j] - 1];
}

/**
 * Device kernel unpacking a receive buffer, accumulating into u
 */
template< typename T >
__global__ void gs_unpack_add_kernel(T * __restrict__ u,
                                     const T * __restrict__ buf,
                                     const int32_t * __restrict__ dof,
                                     const int n) {

  const int j = threadIdx.x + blockDim.x * blockIdx.x;

  if (j >= n)
    return;

  const int32_t idx = dof[j];
  const T val = buf[j];
  if (idx < 0) {
    /* Negative indices mark dofs hit by several buffer entries; use an
       atomic add (double-precision atomicAdd requires sm_60 or newer) */
#if __CUDA_ARCH__ >= 600
    atomicAdd(&u[-idx - 1], val);
#endif
  } else {
    u[idx - 1] += val;
  }
}

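/*
 * The pack/unpack pair supports the distributed half of the gather-scatter:
 * shared values are packed into a contiguous buffer, exchanged between
 * ranks (e.g. via MPI, not shown here), and accumulated on arrival. A
 * hypothetical sequence around such an exchange; buf_d and dof_d are
 * assumed to be prepared by the communication layer, and for brevity the
 * same dof list and count are used for pack and unpack, which need not
 * hold in practice.
 */
template< typename T >
void gs_exchange_example(T *u_d, T *buf_d, const int32_t *dof_d, const int n) {

  const int nthrds = 256;
  const int nblcks = (n + nthrds - 1) / nthrds;

  /* Gather the n shared values into the contiguous send buffer */
  gs_pack_kernel<T><<<nblcks, nthrds>>>(u_d, buf_d, dof_d, n);

  /* ...exchange buf_d with the neighbouring rank here (not shown)... */

  /* Accumulate received values; negative dof entries trigger atomicAdd
     since several incoming values may target the same local dof */
  gs_unpack_add_kernel<T><<<nblcks, nthrds>>>(u_d, buf_d, dof_d, n);
}
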
#endif // __GS_GS_KERNELS__