48 void *
dr,
void *
ds,
void *
dt,
49 void *
dx,
void *
dy,
void *
dz,
50 void *
jacinv,
int *nel,
int *lx);
58 void *
dr,
void *
ds,
void *
dt,
59 void *
dx,
void *
dy,
void *
dz,
60 void *
jacinv,
int *nel,
int *lx) {
62 static int autotune[16] = { 0 };
64 const dim3 nthrds_1d(1024, 1, 1);
65 const dim3 nthrds_kstep((*lx), (*lx), 1);
66 const dim3 nblcks((*nel), 1, 1);
70 dudxyz_kernel_1d<real, LX, 1024> \
71 <<<nblcks, nthrds_1d, 0, stream>>>((real *) du, (real *) u, \
72 (real *) dr, (real *) ds, (real *) dt, \
73 (real *) dx, (real *) dy, (real *) dz, \
75 CUDA_CHECK(cudaGetLastError());
77 #define CASE_KSTEP(LX) \
78 dudxyz_kernel_kstep<real, LX> \
79 <<<nblcks, nthrds_kstep, 0, stream>>>((real *) du, (real *) u, \
80 (real *) dr, (real *) ds, (real *) dt, \
81 (real *) dx, (real *) dy, (real *) dz, \
83 CUDA_CHECK(cudaGetLastError());
87 if(autotune[LX] == 0 ) { \
88 autotune[LX]=tune_dudxyz<LX>(du, u, \
92 } else if (autotune[LX] == 1 ) { \
94 } else if (autotune[LX] == 2 ) { \
99 #define CASE_LARGE(LX) \
118 fprintf(stderr, __FILE__
": size not supported: %d\n", *lx);
133 fprintf(stderr, __FILE__
": size not supported: %d\n", *lx);
141 template < const
int LX >
143 void *
dr,
void *
ds,
void *
dt,
144 void *
dx,
void *
dy,
void *
dz,
145 void *
jacinv,
int *nel,
int *lx) {
146 cudaEvent_t start,stop;
150 const dim3 nthrds_1d(1024, 1, 1);
151 const dim3 nthrds_kstep((*lx), (*lx), 1);
152 const dim3 nblcks((*nel), 1, 1);
155 char *env_value = NULL;
156 char neko_log_buf[80];
158 env_value=getenv(
"NEKO_AUTOTUNE");
160 sprintf(neko_log_buf,
"Autotune dudxyz (lx: %d)", *lx);
164 if( !strcmp(env_value,
"1D") ) {
166 sprintf(neko_log_buf,
"Set by env : 1 (1D)");
170 }
else if( !strcmp(env_value,
"KSTEP") ) {
172 sprintf(neko_log_buf,
"Set by env : 2 (KSTEP)");
177 sprintf(neko_log_buf,
"Invalid value set for NEKO_AUTOTUNE");
182 cudaEventCreate(&start);
183 cudaEventCreate(&stop);
185 cudaEventRecord(start,0);
187 for(
int i = 0;
i < 100;
i++) {
191 cudaEventRecord(stop,0);
192 cudaEventSynchronize(stop);
193 cudaEventElapsedTime(&time1, start, stop);
195 cudaEventRecord(start,0);
197 for(
int i = 0;
i < 100;
i++) {
201 cudaEventRecord(stop,0);
202 cudaEventSynchronize(stop);
203 cudaEventElapsedTime(&time2, start, stop);
211 sprintf(neko_log_buf,
"Chose : %d (%s)", retval,
212 (retval > 1 ?
"KSTEP" :
"1D"));
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__global__ void const T *__restrict__ const T *__restrict__ dr
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
void log_error(char *msg)
void log_message(char *msg)
void log_section(char *msg)
void cuda_dudxyz(void *du, void *u, void *dr, void *ds, void *dt, void *dx, void *dy, void *dz, void *jacinv, int *nel, int *lx)
int tune_dudxyz(void *du, void *u, void *dr, void *ds, void *dt, void *dx, void *dy, void *dz, void *jacinv, int *nel, int *lx)