1 #ifndef __MATH_LAMBDA2_KERNEL_H__
2 #define __MATH_LAMBDA2_KERNEL_H__
44 T grad21, T grad22, T grad23,
45 T grad31, T grad32, T grad33 ) {
49 T s12 = 0.5*(grad12+grad21);
50 T s13 = 0.5*(grad13+grad31);
51 T s23 = 0.5*(grad23+grad32);
53 T o12 = 0.5*(grad12-grad21);
54 T o13 = 0.5*(grad13-grad31);
55 T o23 = 0.5*(grad23-grad32);
57 T a11 = s11*s11 + s12*s12 + s13*s13 - o12*o12 - o13*o13;
58 T a12 = s11 * s12 + s12 * s22 + s13 * s23 - o13 * o23;
59 T a13 = s11 * s13 + s12 * s23 + s13 * s33 + o12 * o23;
61 T a22 = s12*s12 + s22*s22 + s23*s23 - o12*o12 - o23*o23;
62 T a23 = s12 * s13 + s22 * s23 + s23 * s33 - o12 * o13;
63 T a33 = s13*s13 + s23*s23 + s33*s33 - o13*o13 - o23*o23;
66 T B = -(a11 + a22 + a33);
67 T C = -(a12*a12 + a13*a13 + a23*a23 - a11 * a22 - a11 * a33 - a22 * a33);
68 T D = -(2.0 * a12 * a13 * a23 - a11 * a23*a23 - a22 * a13*a13
69 - a33 * a12*a12 + a11 * a22 * a33);
72 T q = (3.0 * C - B*B) / 9.0;
73 T r = (9.0 * C * B - 27.0 * D - 2.0 * B*B*B) / 54.0;
75 T theta = acos( r / sqrt(-q*q*q) );
77 T eigen1 = 2.0 * sqrt(-q) * cos(theta / 3.0) - B / 3.0;
78 T eigen2 = 2.0 * sqrt(-q) * cos((theta + 2.0 *
pi) / 3.0) - B / 3.0;
79 T eigen3 = 2.0 * sqrt(-q) * cos((theta + 4.0 *
pi) / 3.0) - B / 3.0;
81 if (eigen1 <= eigen2 && eigen2 <= eigen3)
83 else if (eigen3 <= eigen2 && eigen2 <= eigen1)
85 else if (eigen1 <= eigen3 && eigen3 <= eigen2)
87 else if (eigen2 <= eigen3 && eigen3 <= eigen1)
89 else if (eigen2 <= eigen1 && eigen1 <= eigen3)
91 else if (eigen3 <= eigen1 && eigen1 <= eigen2)
97 template<
typename T, const
int LX, const
int CHUNKS >
99 const T * __restrict__
u,
100 const T * __restrict__
v,
101 const T * __restrict__
w,
102 const T * __restrict__
dx,
103 const T * __restrict__
dy,
104 const T * __restrict__
dz,
105 const T * __restrict__
drdx,
106 const T * __restrict__
dsdx,
107 const T * __restrict__
dtdx,
108 const T * __restrict__
drdy,
109 const T * __restrict__
dsdy,
110 const T * __restrict__
dtdy,
111 const T * __restrict__
drdz,
112 const T * __restrict__
dsdz,
113 const T * __restrict__
dtdz,
114 const T * __restrict__
jacinv) {
116 __shared__ T
shu[LX * LX * LX];
117 __shared__ T
shv[LX * LX * LX];
118 __shared__ T
shw[LX * LX * LX];
120 __shared__ T
shdx[LX * LX];
121 __shared__ T
shdy[LX * LX];
122 __shared__ T
shdz[LX * LX];
129 const int e = blockIdx.x;
130 const int ele = blockIdx.x*LX*LX*LX;
131 const int iii = threadIdx.x;
132 const int nchunks = (LX * LX * LX - 1) / CHUNKS + 1;
135 if (iii < (LX * LX)) {
142 while(
j < (LX * LX * LX)) {
151 for (
int n = 0; n < nchunks; n++) {
152 const int ijk = iii + n * CHUNKS;
153 const int jk = ijk / LX;
157 if (
i < LX &&
j < LX && k < LX ) {
169 for (
int l = 0; l < LX; l++) {
170 rtmpu +=
shdx[
i + l * LX] *
shu[l +
j * LX + k * LX * LX];
171 stmpu +=
shdy[
j + l * LX] *
shu[
i + l * LX + k * LX * LX];
172 ttmpu +=
shdz[k + l * LX] *
shu[
i +
j * LX + l * LX * LX];
174 rtmpv +=
shdx[
i + l * LX] *
shv[l +
j * LX + k * LX * LX];
175 stmpv +=
shdy[
j + l * LX] *
shv[
i + l * LX + k * LX * LX];
176 ttmpv +=
shdz[k + l * LX] *
shv[
i +
j * LX + l * LX * LX];
178 rtmpw +=
shdx[
i + l * LX] *
shw[l +
j * LX + k * LX * LX];
179 stmpw +=
shdy[
j + l * LX] *
shw[
i + l * LX + k * LX * LX];
180 ttmpw +=
shdz[k + l * LX] *
shw[
i +
j * LX + l * LX * LX];
230 lambda2[ijk +
e*LX*LX*LX] = eigen_val_calc<T>( grad11, grad12, grad13,
231 grad21, grad22, grad23,
232 grad31, grad32, grad33);
238 template<
typename T, const
int LX >
240 lambda2_kernel_kstep(T * __restrict__
lambda2,
241 const T * __restrict__
u,
242 const T * __restrict__
v,
243 const T * __restrict__
w,
244 const T * __restrict__
dx,
245 const T * __restrict__
dy,
246 const T * __restrict__
dz,
258 __shared__ T
shu[LX * LX];
259 __shared__ T
shv[LX * LX];
260 __shared__ T
shw[LX * LX];
266 const int e = blockIdx.x;
267 const int j = threadIdx.y;
268 const int i = threadIdx.x;
281 for (
int k = 0; k < LX; ++k) {
290 for (
int k = 0; k < LX; ++k) {
291 const int ijk =
ij + k*LX*LX;
299 for (
int l = 0; l < LX; l++) {
300 ttmpu +=
shdz[k+l*LX] *
ru[l];
301 ttmpv +=
shdz[k+l*LX] *
rv[l];
302 ttmpw +=
shdz[k+l*LX] *
rw[l];
313 for (
int l = 0; l < LX; l++) {
322 T grad11 = jinv * (
drdx[ijk +
ele] * rtmpu
326 T grad12 = jinv * (
drdy[ijk +
ele] * rtmpu
330 T grad13 = jinv * (
drdz[ijk +
ele] * rtmpu
333 T grad21 = jinv * (
drdx[ijk +
ele] * rtmpv
337 T grad22 = jinv * (
drdy[ijk +
ele] * rtmpv
341 T grad23 = jinv * (
drdz[ijk +
ele] * rtmpv
344 T grad31 = jinv * (
drdx[ijk +
ele] * rtmpw
348 T grad32 = jinv * (
drdy[ijk +
ele] * rtmpw
352 T grad33 = jinv * (
drdz[ijk +
ele] * rtmpw
355 lambda2[ijk +
ele] = eigen_val_calc<T>( grad11, grad12, grad13,
356 grad21, grad22, grad23,
357 grad31, grad32, grad33);
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__global__ void lambda2_kernel_1d(T *__restrict__ lambda2, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ drdx, const T *__restrict__ dsdx, const T *__restrict__ dtdx, const T *__restrict__ drdy, const T *__restrict__ dsdy, const T *__restrict__ dtdy, const T *__restrict__ drdz, const T *__restrict__ dsdz, const T *__restrict__ dtdz, const T *__restrict__ jacinv)
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__shared__ T shdz[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__global__ void __launch_bounds__(LX *LX, 3) lambda2_kernel_kstep(T *__restrict__ lambda2
__shared__ T shdy[LX *LX]
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__inline__ __device__ T eigen_val_calc(T grad11, T grad12, T grad13, T grad21, T grad22, T grad23, T grad31, T grad32, T grad33)
__shared__ T shdx[LX *LX]
__global__ void const T *__restrict__ u
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__global__ void const T *__restrict__ const T *__restrict__ v
A simulation component that computes lambda2 The values are stored in the field registry under the na...
real(kind=rp), parameter, public pi