1 #ifndef __MATH_TENSOR_KERNEL_H__
2 #define __MATH_TENSOR_KERNEL_H__
36 template<
typename T, const
int N >
39 const T * __restrict__
u,
41 const T * __restrict__ A,
42 const T * __restrict__ Bt,
43 const T * __restrict__ Ct,
46 __shared__ T shwork[N*N*N];
47 __shared__ T shwork2[N*N*N];
49 const int idx = threadIdx.x;
50 const int str = blockDim.x;
51 const int pt = blockIdx.x;
52 const int e = elements[pt];
54 for (
int ii = idx; ii< nu*nu*nv; ii += str) {
58 for(
int l = 0; l < nu; l++){
59 tmp += A[
i+l*nv+pt*nv*nu]*
u[l+nu*
j+
e*nu*nu*nu];
66 for (
int ijk = idx; ijk< nu*nv*nv; ijk += str) {
67 const int jk = ijk / nv;
68 const int i = ijk - jk * nv;
69 const int k = jk / nv;
70 const int j = jk - k * nv;
72 const int ik2 =
i + k*nv*nu;
73 for(
int l = 0; l < nu; l++){
74 tmp += Bt[l+
j*nu+pt*nv*nu]*shwork[l*nv+ik2];
81 for (
int ijk = idx; ijk< nv*nv*nv; ijk += str) {
82 const int jk = ijk / nv;
83 const int i = ijk - jk * nv;
84 const int k = jk / nv;
85 const int j = jk - k * nv;
87 const int ij2 =
i +
j*nv;
88 for(
int l = 0; l < nu; l++){
89 tmp += Ct[l+k*nu+pt*nv*nu]*shwork2[ij2 + l*nv*nv];
91 v[ijk+pt*nv*nv*nv] = tmp;
98 template<
typename T, const
int N >
101 const T * __restrict__
u,
103 const T * __restrict__ A,
104 const T * __restrict__ Bt,
105 const T * __restrict__ Ct) {
106 __shared__ T shwork[N*N*N];
107 __shared__ T shwork2[N*N*N];
109 const int idx = threadIdx.x;
110 const int str = blockDim.x;
111 const int e = blockIdx.x;
113 for (
int ii = idx; ii< nu*nu*nv; ii += str) {
117 for(
int l = 0; l < nu; l++){
118 tmp += A[
i+l*nv]*
u[l+nu*
j+
e*nu*nu*nu];
125 for (
int ijk = idx; ijk< nu*nv*nv; ijk += str) {
126 const int jk = ijk / nv;
127 const int i = ijk - jk * nv;
128 const int k = jk / nv;
129 const int j = jk - k * nv;
131 const int ik2 =
i + k*nv*nu;
132 for(
int l = 0; l < nu; l++){
133 tmp += Bt[l+
j*nu]*shwork[l*nv+ik2];
140 for (
int ijk = idx; ijk< nv*nv*nv; ijk += str) {
141 const int jk = ijk / nv;
142 const int i = ijk - jk * nv;
143 const int k = jk / nv;
144 const int j = jk - k * nv;
146 const int ij2 =
i +
j*nv;
147 for(
int l = 0; l < nu; l++){
148 tmp += Ct[l+k*nu]*shwork2[ij2 + l*nv*nv];
150 v[ijk+
e*nv*nv*nv] = tmp;
154 template<
typename T, const
int N >
157 const T * __restrict__
u,
159 const T * __restrict__ A,
160 const T * __restrict__ Bt,
161 const T * __restrict__ Ct) {
162 __shared__ T shwork[N*N*N];
165 const int idx = threadIdx.x;
166 const int str = blockDim.x;
167 const int e = blockIdx.x;
169 for (
int ii = idx; ii< nu*nu*nv; ii += str) {
173 for(
int l = 0; l < nu; l++){
174 tmp += A[
i+l*nv]*
u[l+nu*
j+
e*nu*nu*nu];
181 for (
int ijk = idx; ijk< nu*nv*nv; ijk += str) {
182 const int jk = ijk / nv;
183 const int i = ijk - jk * nv;
184 const int k = jk / nv;
185 const int j = jk - k * nv;
187 const int ik2 =
i + k*nv*nu;
188 for(
int l = 0; l < nu; l++){
189 tmp += Bt[l+
j*nu]*shwork[l*nv+ik2];
196 for (
int ijk = idx; ijk< nv*nv*nv; ijk += str) {
197 const int jk = ijk / nv;
198 const int i = ijk - jk * nv;
199 const int k = jk / nv;
200 const int j = jk - k * nv;
202 const int ij2 =
i +
j*nv;
203 for(
int l = 0; l < nu; l++){
204 tmp += Ct[l+k*nu]*shwork2[ij2 + l*nv*nv];
206 v[ijk+
e*nv*nv*nv] = tmp;
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void tnsr3d_el_kernel(T *__restrict__ v, const int nv, const T *__restrict__ u, const int nu, const T *__restrict__ A, const T *__restrict__ Bt, const T *__restrict__ Ct, const int *elements, const int n_points)
__global__ void tnsr3d_kernel(T *__restrict__ v, const int nv, const T *__restrict__ u, const int nu, const T *__restrict__ A, const T *__restrict__ Bt, const T *__restrict__ Ct)
__global__ void tnsr3d_kernel_large(T *__restrict__ v, const int nv, const T *__restrict__ u, const int nu, const T *__restrict__ A, const T *__restrict__ Bt, const T *__restrict__ Ct)