1 #ifndef __MATH_FDM_KERNEL_H__
2 #define __MATH_FDM_KERNEL_H__
37 template<
typename T, const
int NL >
42 __shared__ T shwork[NL*NL*NL];
43 __shared__ T shwork2[NL*NL*NL];
44 __shared__ T A[NL*NL];
45 __shared__ T Bt[NL*NL];
46 __shared__ T Ct[NL*NL];
48 const int idx = threadIdx.x;
49 const int str = blockDim.x;
50 const int el = blockIdx.x;
52 A[idx] = s[idx+NL*NL+el*NL*NL*3*2];
53 Bt[idx] = s[idx+2*NL*NL+el*NL*NL*3*2];
54 Ct[idx] = s[idx+2*2*NL*NL+el*NL*NL*3*2];
58 for (
int ii = idx; ii< NL*NL*NL; ii += str){
62 for(
int l = 0; l < NL; l++){
63 tmp += A[
i+l*NL]*r[l+NL*
j+el*NL*NL*NL];
68 for (
int ijk = idx; ijk< NL*NL*NL; ijk += str){
69 const int jk = ijk / NL;
70 const int i = ijk - jk * NL;
71 const int k = jk / NL;
72 const int j = jk - k * NL;
74 const int ik2 =
i + k*NL*NL;
75 for(
int l = 0; l < NL; l++){
76 tmp += Bt[l+
j*NL]*shwork[l*NL+ik2];
81 for (
int ijk = idx; ijk< NL*NL*NL; ijk += str){
82 const int jk = ijk / NL;
83 const int i = ijk - jk * NL;
84 const int k = jk / NL;
85 const int j = jk - k * NL;
87 const int ij2 =
i +
j*NL;
88 for(
int l = 0; l < NL; l++){
89 tmp += Ct[l+k*NL]*shwork2[ij2 + l*NL*NL];
91 r[ijk+el*NL*NL*NL] = tmp*d[ijk+el*NL*NL*NL];
95 A[idx] = s[idx+el*NL*NL*3*2];
96 Bt[idx] = s[idx+NL*NL+2*NL*NL+el*NL*NL*3*2];
97 Ct[idx] = s[idx+NL*NL+2*2*NL*NL+el*NL*NL*3*2];
101 for (
int ii = idx; ii< NL*NL*NL; ii += str){
105 for(
int l = 0; l < NL; l++){
106 tmp += A[
i+l*NL]*r[l+NL*
j+el*NL*NL*NL];
111 for (
int ijk = idx; ijk< NL*NL*NL; ijk += str){
112 const int jk = ijk / NL;
113 const int i = ijk - jk * NL;
114 const int k = jk / NL;
115 const int j = jk - k * NL;
117 const int ik2 =
i + k*NL*NL;
118 for(
int l = 0; l < NL; l++){
119 tmp += Bt[l+
j*NL]*shwork[l*NL+ik2];
124 for (
int ijk = idx; ijk< NL*NL*NL; ijk += str){
125 const int jk = ijk / NL;
126 const int i = ijk - jk * NL;
127 const int k = jk / NL;
128 const int j = jk - k * NL;
130 const int ij2 =
i +
j*NL;
131 for(
int l = 0; l < NL; l++){
132 tmp += Ct[l+k*NL]*shwork2[ij2 + l*NL*NL];
134 e[ijk+el*NL*NL*NL] = tmp;
__global__ void fdm_do_fast_kernel(T *__restrict__ e, T *__restrict__ r, T *__restrict__ s, T *__restrict__ d)