46 template < const
int >
48 void *
dr,
void *
ds,
void *
dt,
50 void *
w3,
int *nel,
int *lx);
58 void *
dr,
void *
ds,
void *
dt,
60 void *
w3,
int *nel,
int *lx) {
62 static int autotune[17] = { 0 };
64 const dim3 nthrds_1d(1024, 1, 1);
65 const dim3 nthrds_kstep((*lx), (*lx), 1);
66 const dim3 nblcks((*nel), 1, 1);
70 cdtp_kernel_1d<real, LX, 1024> \
71 <<<nblcks, nthrds_1d, 0, stream>>>((real *) dtx, (real *) x, \
72 (real *) dr, (real *) ds, (real *) dt, \
73 (real *) dxt, (real *) dyt, (real *) dzt, \
75 CUDA_CHECK(cudaGetLastError());
77 #define CASE_KSTEP(LX) \
78 cdtp_kernel_kstep<real, LX> \
79 <<<nblcks, nthrds_kstep, 0, stream>>>((real *) dtx, (real *) x, \
80 (real *) dr, (real *) ds, (real *) dt, \
81 (real *) dxt, (real *) dyt, (real *) dzt, \
83 CUDA_CHECK(cudaGetLastError());
87 if(autotune[LX] == 0 ) { \
88 autotune[LX]=tune_cdtp<LX>(dtx, x, \
92 } else if (autotune[LX] == 1 ) { \
94 } else if (autotune[LX] == 2 ) { \
99 #define CASE_LARGE(LX) \
120 fprintf(stderr, __FILE__
": size not supported: %d\n", *lx);
133 fprintf(stderr, __FILE__
": size not supported: %d\n", *lx);
141 template < const
int LX >
143 void *
dr,
void *
ds,
void *
dt,
145 void *
w3,
int *nel,
int *lx) {
146 cudaEvent_t start,stop;
150 const dim3 nthrds_1d(1024, 1, 1);
151 const dim3 nthrds_kstep((*lx), (*lx), 1);
152 const dim3 nblcks((*nel), 1, 1);
155 char *env_value = NULL;
156 char neko_log_buf[80];
158 env_value=getenv(
"NEKO_AUTOTUNE");
160 sprintf(neko_log_buf,
"Autotune cdtp (lx: %d)", *lx);
164 if( !strcmp(env_value,
"1D") ) {
166 sprintf(neko_log_buf,
"Set by env : 1 (1D)");
170 }
else if( !strcmp(env_value,
"KSTEP") ) {
172 sprintf(neko_log_buf,
"Set by env : 2 (KSTEP)");
177 sprintf(neko_log_buf,
"Invalid value set for NEKO_AUTOTUNE");
182 cudaEventCreate(&start);
183 cudaEventCreate(&stop);
185 cudaEventRecord(start,0);
187 for(
int i = 0;
i < 100;
i++) {
191 cudaEventRecord(stop,0);
192 cudaEventSynchronize(stop);
193 cudaEventElapsedTime(&time1, start, stop);
195 cudaEventRecord(start,0);
197 for(
int i = 0;
i < 100;
i++) {
201 cudaEventRecord(stop,0);
202 cudaEventSynchronize(stop);
203 cudaEventElapsedTime(&time2, start, stop);
211 sprintf(neko_log_buf,
"Chose : %d (%s)", retval,
212 (retval > 1 ?
"KSTEP" :
"1D"));
__global__ void const T *__restrict__ const T *__restrict__ dr
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
void log_error(char *msg)
void log_message(char *msg)
void log_section(char *msg)
int tune_cdtp(void *dtx, void *x, void *dr, void *ds, void *dt, void *dxt, void *dyt, void *dzt, void *w3, int *nel, int *lx)
void cuda_cdtp(void *dtx, void *x, void *dr, void *ds, void *dt, void *dxt, void *dyt, void *dzt, void *w3, int *nel, int *lx)