38 template<
typename T, const
int LX >
40 const T * __restrict__
dxt,
41 const T * __restrict__
dyt,
42 const T * __restrict__
dzt,
43 const T * __restrict__ G11,
44 const T * __restrict__ G22,
45 const T * __restrict__ G33,
46 const T * __restrict__ G12,
47 const T * __restrict__ G13,
48 const T * __restrict__ G23,
50 const int idx = threadIdx.x + blockIdx.x * blockDim.x;
51 const int e = idx / (LX*LX*LX);
52 const int ijk = idx -
e*LX*LX*LX;
53 const int jk = ijk / LX;
54 const int i = ijk - jk * LX;
55 const int k = jk / LX;
56 const int j = jk - k * LX;
63 for (
int l = 0; l < LX; l++) {
64 T g = G11[l + LX*
j + LX*LX*k + LX*LX*LX*
e];
69 for (
int l = 0; l < LX; l++) {
70 T g = G22[
i + LX*l + LX*LX*k + LX*LX*LX*
e];
75 for (
int l = 0; l < LX; l++) {
76 T g = G33[
i + LX*
j + LX*LX*l + LX*LX*LX*
e];
82 if (
i == 0 ||
i == LX-1) {
83 d += G12[
i + LX*
j + LX*LX*k + LX*LX*LX*
e] *
dxt[
i + LX*
i] *
dyt[
j + LX*
j];
84 d += G13[
i + LX*
j + LX*LX*k + LX*LX*LX*
e] *
dxt[
i + LX*
i] *
dzt[k + LX*k];
87 if (
j == 0 ||
j == LX-1) {
88 d += G12[
i + LX*
j + LX*LX*k + LX*LX*LX*
e] *
dyt[
j + LX*
j] *
dxt[
i + LX*
i];
89 d += G23[
i + LX*
j + LX*LX*k + LX*LX*LX*
e] *
dyt[
j + LX*
j] *
dzt[k + LX*k];
92 if (k == 0 || k == LX-1) {
93 d += G13[
i + LX*
j + LX*LX*k + LX*LX*LX*
e] *
dzt[k + LX*k] *
dxt[
i + LX*
i];
94 d += G23[
i + LX*
j + LX*LX*k + LX*LX*LX*
e] *
dzt[k + LX*k] *
dyt[
j + LX*
j];
103 void *G11,
void *G22,
void *G33,
104 void *G12,
void *G13,
void *G23,
105 int *nel,
int *lxp) {
108 const int threads = 1024;
109 const int blocks = ((*nel * lx*lx*lx) + threads - 1) / threads;
114 jacobi_kernel<real, N><<<blocks, threads, 0, stream>>>( \
116 (real*)dxt, (real*)dyt, (real*)dzt,\
117 (real*)G11, (real*)G22, (real*)G33,\
118 (real*)G12, (real*)G13, (real*)G23,\
139 fprintf(stderr, __FILE__
": size not supported: %d\n", lx);
142 cudaError_t err = cudaGetLastError();
143 if (err != cudaSuccess) {
144 fprintf(stderr, __FILE__
": %s\n", cudaGetErrorString(err));
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
__global__ void jacobi_kernel(T *__restrict__ du, const T *__restrict__ dxt, const T *__restrict__ dyt, const T *__restrict__ dzt, const T *__restrict__ G11, const T *__restrict__ G22, const T *__restrict__ G33, const T *__restrict__ G12, const T *__restrict__ G13, const T *__restrict__ G23, const int nel)
void cuda_jacobi_update(void *d, void *dxt, void *dyt, void *dzt, void *G11, void *G22, void *G33, void *G12, void *G13, void *G23, int *nel, int *lxp)