Neko  0.9.0
A portable framework for high-order spectral element flow simulations
tensor_kernel.h
Go to the documentation of this file.
1 #ifndef __MATH_TENSOR_KERNEL_H__
2 #define __MATH_TENSOR_KERNEL_H__
3 /*
4  Copyright (c) 2021-2022, The Neko Authors
5  All rights reserved.
6 
7  Redistribution and use in source and binary forms, with or without
8  modification, are permitted provided that the following conditions
9  are met:
10 
11  * Redistributions of source code must retain the above copyright
12  notice, this list of conditions and the following disclaimer.
13 
14  * Redistributions in binary form must reproduce the above
15  copyright notice, this list of conditions and the following
16  disclaimer in the documentation and/or other materials provided
17  with the distribution.
18 
19  * Neither the name of the authors nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  POSSIBILITY OF SUCH DAMAGE.
35 */
/**
 * Per-point tensor product v = (C ⊗ B ⊗ A) u for a selected list of points,
 * with a *different* operator triple (A, Bt, Ct) for each point.
 *
 * Launch layout (inferred from the indexing): one thread block per point
 * (blockIdx.x == pt); threads in the block share the work of each stage via
 * block-stride loops, so any blockDim.x is valid here as long as the shared
 * buffers fit (requires nu <= N and nv <= N — TODO confirm against launcher).
 *
 * @param v         output, nv^3 values per point, written at offset pt*nv^3
 * @param nv        output size per direction
 * @param u         input, nu^3 values per element, read at offset e*nu^3
 * @param nu        input size per direction
 * @param A         per-point (nv x nu) matrices, point pt at offset pt*nv*nu
 * @param Bt        per-point (nv x nu) transposed matrices, same layout as A
 * @param Ct        per-point (nv x nu) transposed matrices, same layout as A
 * @param elements  map from point index pt to the element index e inside u
 * @param n_points  number of points; not referenced in the kernel body
 *                  (grid size presumably equals n_points — verify at launch)
 */
template< typename T, const int N >
__global__ void tnsr3d_el_kernel(T * __restrict__ v,
                                 const int nv,
                                 const T * __restrict__ u,
                                 const int nu,
                                 const T * __restrict__ A,
                                 const T * __restrict__ Bt,
                                 const T * __restrict__ Ct,
                                 const int * elements,
                                 const int n_points) {
  __shared__ T shwork[N*N*N];   /* stage-1 intermediate: A contracted with u */
  __shared__ T shwork2[N*N*N];  /* stage-2 intermediate: Bt applied to shwork */

  const int idx = threadIdx.x;
  const int str = blockDim.x;   /* block-stride for the work loops below */
  const int pt = blockIdx.x;    /* one block per point */
  const int e = elements[pt];   /* element of u this point maps to */

  /* Stage 1: contract the first direction with this point's A:
     shwork(i, j) = sum_l A(i, l) * u(l, j), j spanning the remaining
     nu*nu indices of u. */
  for (int ii = idx; ii< nu*nu*nv; ii += str) {
    T tmp = 0.0;
    int j = ii/nv;
    int i = ii - j*nv;          /* i = ii mod nv (first, fastest index) */
    for( int l = 0; l < nu; l++){
      tmp += A[i+l*nv+pt*nv*nu]*u[l+nu*j+e*nu*nu*nu];
    }
    shwork[ii] = tmp;
  }

  __syncthreads();  /* all of shwork written before stage 2 reads it */

  /* Stage 2: contract the middle direction with this point's Bt:
     shwork2(i, j, k) = sum_l Bt(l, j) * shwork(i, l, k). */
  for (int ijk = idx; ijk< nu*nv*nv; ijk += str) {
    const int jk = ijk / nv;
    const int i = ijk - jk * nv;
    const int k = jk / nv;
    const int j = jk - k * nv;
    T tmp = 0.0;
    const int ik2 = i + k*nv*nu;  /* base offset of row (i, :, k) in shwork */
    for( int l = 0; l < nu; l++){
      tmp += Bt[l+j*nu+pt*nv*nu]*shwork[l*nv+ik2];
    }
    shwork2[ijk] = tmp;
  }

  __syncthreads();  /* all of shwork2 written before stage 3 reads it */

  /* Stage 3: contract the last direction with this point's Ct and write
     the nv^3 result for this point straight to global memory. */
  for (int ijk = idx; ijk< nv*nv*nv; ijk += str) {
    const int jk = ijk / nv;
    const int i = ijk - jk * nv;
    const int k = jk / nv;
    const int j = jk - k * nv;
    T tmp = 0.0;
    const int ij2 = i + j*nv;     /* base offset of column (i, j, :) in shwork2 */
    for( int l = 0; l < nu; l++){
      tmp += Ct[l+k*nu+pt*nv*nu]*shwork2[ij2 + l*nv*nv];
    }
    v[ijk+pt*nv*nv*nv] = tmp;
  }

}
95 
96 
97 
/**
 * Tensor product v = (C ⊗ B ⊗ A) u applied element-wise, with one shared
 * operator triple (A, Bt, Ct) for all elements.
 *
 * Launch layout (inferred from the indexing): one thread block per element
 * (blockIdx.x == element); threads cooperate through block-stride loops, so
 * any blockDim.x works as long as nu <= N and nv <= N so the two N^3 shared
 * staging buffers are large enough — confirm against the host launcher.
 *
 * @param v   output, nv^3 values per element
 * @param nv  output size per direction
 * @param u   input, nu^3 values per element
 * @param nu  input size per direction
 * @param A   (nv x nu) operator for the first direction
 * @param Bt  (nv x nu) transposed operator for the second direction
 * @param Ct  (nv x nu) transposed operator for the third direction
 */
template< typename T, const int N >
__global__ void tnsr3d_kernel(T * __restrict__ v,
                              const int nv,
                              const T * __restrict__ u,
                              const int nu,
                              const T * __restrict__ A,
                              const T * __restrict__ Bt,
                              const T * __restrict__ Ct) {
  __shared__ T stage1[N*N*N];  /* A contracted into the first direction */
  __shared__ T stage2[N*N*N];  /* Bt contracted into the second direction */

  const int tid = threadIdx.x;
  const int nthreads = blockDim.x;
  const int el = blockIdx.x;           /* one block per element */
  const int u_base = el * nu * nu * nu; /* start of this element's input */

  /* Stage 1: stage1(i, j) = sum_l A(i, l) * u(l, j), with j covering the
     trailing nu*nu indices of u. */
  for (int ii = tid; ii < nv * nu * nu; ii += nthreads) {
    const int j = ii / nv;
    const int i = ii % nv;
    T acc = 0.0;
    for (int l = 0; l < nu; l++) {
      acc += A[i + l * nv] * u[u_base + l + j * nu];
    }
    stage1[ii] = acc;
  }

  __syncthreads();  /* stage1 fully written before it is read below */

  /* Stage 2: stage2(i, j, k) = sum_l Bt(l, j) * stage1(i, l, k). */
  for (int ijk = tid; ijk < nv * nv * nu; ijk += nthreads) {
    const int jk = ijk / nv;
    const int i = ijk % nv;
    const int k = jk / nv;
    const int j = jk % nv;
    const int row = i + k * nv * nu;   /* offset of (i, :, k) in stage1 */
    T acc = 0.0;
    for (int l = 0; l < nu; l++) {
      acc += Bt[l + j * nu] * stage1[row + l * nv];
    }
    stage2[ijk] = acc;
  }

  __syncthreads();  /* stage2 fully written before it is read below */

  /* Stage 3: v(i, j, k) = sum_l Ct(l, k) * stage2(i, j, l), streamed
     directly to this element's slot in global memory. */
  for (int ijk = tid; ijk < nv * nv * nv; ijk += nthreads) {
    const int jk = ijk / nv;
    const int i = ijk % nv;
    const int k = jk / nv;
    const int j = jk % nv;
    const int col = i + j * nv;        /* offset of (i, j, :) in stage2 */
    T acc = 0.0;
    for (int l = 0; l < nu; l++) {
      acc += Ct[l + k * nu] * stage2[col + l * nv * nv];
    }
    v[ijk + el * nv * nv * nv] = acc;
  }
}
153 
/**
 * Variant of tnsr3d_kernel for large N: the second intermediate (shwork2)
 * lives in per-thread local storage instead of shared memory, halving the
 * shared-memory footprint (only one N^3 shared buffer).
 *
 * NOTE(review): because shwork2 is private to each thread, stage 3 can only
 * correctly read entries that the *same* thread wrote in stage 2. The index
 * algebra lines up iff blockDim.x evenly divides nv*nv (e.g. blockDim.x ==
 * nv*nv) — confirm against the host-side launch configuration.
 *
 * @param v   output, nv^3 values per element (one block per element)
 * @param nv  output size per direction
 * @param u   input, nu^3 values per element
 * @param nu  input size per direction
 * @param A   (nv x nu) operator for the first direction
 * @param Bt  (nv x nu) transposed operator for the second direction
 * @param Ct  (nv x nu) transposed operator for the third direction
 */
template< typename T, const int N >
__global__ void tnsr3d_kernel_large(T * __restrict__ v,
                                    const int nv,
                                    const T * __restrict__ u,
                                    const int nu,
                                    const T * __restrict__ A,
                                    const T * __restrict__ Bt,
                                    const T * __restrict__ Ct) {
  __shared__ T shwork[N*N*N];  /* stage-1 intermediate, shared by the block */
  T shwork2[N*N*N];            /* stage-2 intermediate, PRIVATE to each thread
                                  (no __shared__) — see NOTE(review) above */

  const int idx = threadIdx.x;
  const int str = blockDim.x;  /* block-stride for the work loops below */
  const int e = blockIdx.x;    /* one block per element */

  /* Stage 1: shwork(i, j) = sum_l A(i, l) * u(l, j), j spanning the
     remaining nu*nu indices of u. */
  for (int ii = idx; ii< nu*nu*nv; ii += str) {
    T tmp = 0.0;
    int j = ii/nv;
    int i = ii - j*nv;         /* i = ii mod nv (first, fastest index) */
    for( int l = 0; l < nu; l++){
      tmp += A[i+l*nv]*u[l+nu*j+e*nu*nu*nu];
    }
    shwork[ii] = tmp;
  }

  __syncthreads();  /* all of shwork written before stage 2 reads it */

  /* Stage 2: each thread computes and keeps its own slice of
     shwork2(i, j, k) = sum_l Bt(l, j) * shwork(i, l, k); only the entries
     with ijk ≡ idx (mod str) are valid in this thread's copy. */
  for (int ijk = idx; ijk< nu*nv*nv; ijk += str) {
    const int jk = ijk / nv;
    const int i = ijk - jk * nv;
    const int k = jk / nv;
    const int j = jk - k * nv;
    T tmp = 0.0;
    const int ik2 = i + k*nv*nu;  /* base offset of row (i, :, k) in shwork */
    for( int l = 0; l < nu; l++){
      tmp += Bt[l+j*nu]*shwork[l*nv+ik2];
    }
    shwork2[ijk] = tmp;
  }

  /* NOTE(review): shwork2 is thread-private, and shwork is not read after
     this point, so this barrier does not appear to order any shared-memory
     access — seemingly redundant but harmless. */
  __syncthreads();

  /* Stage 3: v(i, j, k) = sum_l Ct(l, k) * shwork2(i, j, l). The indices
     read here hit only this thread's own valid shwork2 entries, provided
     blockDim.x divides nv*nv (see NOTE(review) above). */
  for (int ijk = idx; ijk< nv*nv*nv; ijk += str) {
    const int jk = ijk / nv;
    const int i = ijk - jk * nv;
    const int k = jk / nv;
    const int j = jk - k * nv;
    T tmp = 0.0;
    const int ij2 = i + j*nv;     /* base offset of column (i, j, :) */
    for( int l = 0; l < nu; l++){
      tmp += Ct[l+k*nu]*shwork2[ij2 + l*nv*nv];
    }
    v[ijk+e*nv*nv*nv] = tmp;
  }
}
209 
210 
211 
212 #endif // __MATH_TENSOR_KERNEL_H__
const int i
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__syncthreads()
__global__ void tnsr3d_el_kernel(T *__restrict__ v, const int nv, const T *__restrict__ u, const int nu, const T *__restrict__ A, const T *__restrict__ Bt, const T *__restrict__ Ct, const int *elements, const int n_points)
Definition: tensor_kernel.h:37
__global__ void tnsr3d_kernel(T *__restrict__ v, const int nv, const T *__restrict__ u, const int nu, const T *__restrict__ A, const T *__restrict__ Bt, const T *__restrict__ Ct)
Definition: tensor_kernel.h:99
__global__ void tnsr3d_kernel_large(T *__restrict__ v, const int nv, const T *__restrict__ u, const int nu, const T *__restrict__ A, const T *__restrict__ Bt, const T *__restrict__ Ct)