1 #ifndef __MATH_TENSOR_KERNEL_H__
2 #define __MATH_TENSOR_KERNEL_H__
/**
 * Device kernel for tnsr3d_el.
 *
 * Applies a three-stage tensor contraction to one element of u per thread
 * block and stores the result for the corresponding point in v:
 *
 *   stage 1: w1(i,j,k) = sum_l A(i,l)  * u(l,j,k)    (result kept in shwork)
 *   stage 2: w2(i,j,k) = sum_l Bt(l,j) * w1(i,l,k)   (result kept in shwork2)
 *   stage 3: v(i,j,k)  = sum_l Ct(l,k) * w2(i,j,l)
 *
 * Launch layout: block `pt = blockIdx.x` handles point `pt`; the threads of
 * the block (1D, threadIdx.x) stride over the entries of each stage. Blocks
 * with `pt >= n_points` exit immediately.
 *
 * Shared memory: two static arrays of N*N*N entries of T. The intermediate
 * results hold nu*nu*nv and nu*nv*nv entries respectively, so the caller must
 * pick N such that both fit in N*N*N.
 *
 * @param v        Output; point pt occupies v[pt*nv^3 .. (pt+1)*nv^3).
 * @param nv       Extent of each output direction.
 * @param u        Input; element e occupies u[e*nu^3 .. (e+1)*nu^3).
 * @param nu       Extent of each input direction.
 * @param A        Per-point operator, nv*nu entries per point, read as
 *                 A[i + l*nv + pt*nv*nu].
 * @param Bt       Per-point operator, nv*nu entries per point, read as
 *                 Bt[l + j*nu + pt*nv*nu].
 * @param Ct       Per-point operator, nv*nu entries per point, read as
 *                 Ct[l + k*nu + pt*nv*nu].
 * @param elements For each point, the index of the element of u to read.
 * @param n_points Number of valid entries in `elements`.
 */
template< typename T, const int N >
__global__ void tnsr3d_el_kernel(T * __restrict__ v,
                                 const int nv,
                                 const T * __restrict__ u,
                                 const int nu,
                                 const T * __restrict__ A,
                                 const T * __restrict__ Bt,
                                 const T * __restrict__ Ct,
                                 const int * elements,
                                 const int n_points) {
  __shared__ T shwork[N*N*N];
  __shared__ T shwork2[N*N*N];

  const int idx = threadIdx.x;
  const int str = blockDim.x;
  const int pt = blockIdx.x;

  /* The condition depends on blockIdx only, so either every thread of the
     block returns here or none does; the barriers below stay well-formed. */
  if (pt >= n_points) return;

  const int e = elements[pt];

  /* Base offsets are computed in 64-bit: e*nu^3 and pt*nv^3 can exceed the
     range of int for large meshes. */
  const T *u_e   = u  + (long long) e  * nu * nu * nu;
  const T *A_pt  = A  + (long long) pt * nv * nu;
  const T *Bt_pt = Bt + (long long) pt * nv * nu;
  const T *Ct_pt = Ct + (long long) pt * nv * nu;
  T *v_pt        = v  + (long long) pt * nv * nv * nv;

  /* Stage 1: contract A with the first index of u. */
  for (int ii = idx; ii < nu*nu*nv; ii += str) {
    const int j = ii / nv;       /* combined (second, third) input index */
    const int i = ii - j * nv;   /* first output index */
    T tmp = T(0);
    for (int l = 0; l < nu; l++) {
      tmp += A_pt[i + l*nv] * u_e[l + nu*j];
    }
    shwork[ii] = tmp;
  }

  /* Stage 2 reads shwork entries written by other threads. */
  __syncthreads();

  /* Stage 2: contract Bt with the second index. */
  for (int ijk = idx; ijk < nu*nv*nv; ijk += str) {
    const int jk = ijk / nv;
    const int i = ijk - jk * nv;
    const int k = jk / nv;
    const int j = jk - k * nv;
    T tmp = T(0);
    const int ik2 = i + k*nv*nu;
    for (int l = 0; l < nu; l++) {
      tmp += Bt_pt[l + j*nu] * shwork[l*nv + ik2];
    }
    shwork2[ijk] = tmp;
  }

  /* Stage 3 reads shwork2 entries written by other threads. */
  __syncthreads();

  /* Stage 3: contract Ct with the third index and write the result. */
  for (int ijk = idx; ijk < nv*nv*nv; ijk += str) {
    const int jk = ijk / nv;
    const int i = ijk - jk * nv;
    const int k = jk / nv;
    const int j = jk - k * nv;
    T tmp = T(0);
    const int ij2 = i + j*nv;
    for (int l = 0; l < nu; l++) {
      tmp += Ct_pt[l + k*nu] * shwork2[ij2 + l*nv*nv];
    }
    v_pt[ijk] = tmp;
  }
}
/**
 * Device kernel for tnsr3d.
 *
 * Applies a three-stage tensor contraction to one element of u per thread
 * block, using the same operators A, Bt, Ct for every element:
 *
 *   stage 1: w1(i,j,k) = sum_l A(i,l)  * u(l,j,k)    (result kept in shwork)
 *   stage 2: w2(i,j,k) = sum_l Bt(l,j) * w1(i,l,k)   (result kept in shwork2)
 *   stage 3: v(i,j,k)  = sum_l Ct(l,k) * w2(i,j,l)
 *
 * Launch layout: block `e = blockIdx.x` handles element `e`; the threads of
 * the block (1D, threadIdx.x) stride over the entries of each stage.
 *
 * Shared memory: two static arrays of N*N*N entries of T. The intermediate
 * results hold nu*nu*nv and nu*nv*nv entries respectively, so the caller must
 * pick N such that both fit in N*N*N.
 *
 * @param v   Output; element e occupies v[e*nv^3 .. (e+1)*nv^3).
 * @param nv  Extent of each output direction.
 * @param u   Input; element e occupies u[e*nu^3 .. (e+1)*nu^3).
 * @param nu  Extent of each input direction.
 * @param A   Operator with nv*nu entries, read as A[i + l*nv].
 * @param Bt  Operator with nv*nu entries, read as Bt[l + j*nu].
 * @param Ct  Operator with nv*nu entries, read as Ct[l + k*nu].
 */
template< typename T, const int N >
__global__ void tnsr3d_kernel(T * __restrict__ v,
                              const int nv,
                              const T * __restrict__ u,
                              const int nu,
                              const T * __restrict__ A,
                              const T * __restrict__ Bt,
                              const T * __restrict__ Ct) {
  __shared__ T shwork[N*N*N];
  __shared__ T shwork2[N*N*N];

  const int idx = threadIdx.x;
  const int str = blockDim.x;
  const int e = blockIdx.x;

  /* Base offsets are computed in 64-bit: e*nu^3 and e*nv^3 can exceed the
     range of int for large meshes. */
  const T *u_e = u + (long long) e * nu * nu * nu;
  T *v_e       = v + (long long) e * nv * nv * nv;

  /* Stage 1: contract A with the first index of u. */
  for (int ii = idx; ii < nu*nu*nv; ii += str) {
    const int j = ii / nv;       /* combined (second, third) input index */
    const int i = ii - j * nv;   /* first output index */
    T tmp = T(0);
    for (int l = 0; l < nu; l++) {
      tmp += A[i + l*nv] * u_e[l + nu*j];
    }
    shwork[ii] = tmp;
  }

  /* Stage 2 reads shwork entries written by other threads. */
  __syncthreads();

  /* Stage 2: contract Bt with the second index. */
  for (int ijk = idx; ijk < nu*nv*nv; ijk += str) {
    const int jk = ijk / nv;
    const int i = ijk - jk * nv;
    const int k = jk / nv;
    const int j = jk - k * nv;
    T tmp = T(0);
    const int ik2 = i + k*nv*nu;
    for (int l = 0; l < nu; l++) {
      tmp += Bt[l + j*nu] * shwork[l*nv + ik2];
    }
    shwork2[ijk] = tmp;
  }

  /* Stage 3 reads shwork2 entries written by other threads. */
  __syncthreads();

  /* Stage 3: contract Ct with the third index and write the result. */
  for (int ijk = idx; ijk < nv*nv*nv; ijk += str) {
    const int jk = ijk / nv;
    const int i = ijk - jk * nv;
    const int k = jk / nv;
    const int j = jk - k * nv;
    T tmp = T(0);
    const int ij2 = i + j*nv;
    for (int l = 0; l < nu; l++) {
      tmp += Ct[l + k*nu] * shwork2[ij2 + l*nv*nv];
    }
    v_e[ijk] = tmp;
  }
}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void tnsr3d_el_kernel(T *__restrict__ v, const int nv, const T *__restrict__ u, const int nu, const T *__restrict__ A, const T *__restrict__ Bt, const T *__restrict__ Ct, const int *elements, const int n_points)
__global__ void tnsr3d_kernel(T *__restrict__ v, const int nv, const T *__restrict__ u, const int nu, const T *__restrict__ A, const T *__restrict__ Bt, const T *__restrict__ Ct)