50 void *
g13,
void *
g23,
int *nelv,
int *lx);
56 void *
g13,
void *
g23,
int *nelv,
int *lx);
66 void *
g13,
void *
g23,
int *nelv,
int *lx) {
68 static int autotune[17] = { 0 };
70 const dim3 nthrds_1d(1024, 1, 1);
71 const dim3 nblcks_1d((*nelv), 1, 1);
72 const dim3 nthrds_kstep((*lx), (*lx), 1);
73 const dim3 nblcks_kstep((*nelv), 1, 1);
75 const dim3 nthrds((*lx), (*lx), 1);
76 const dim3 nblcks((*nelv), 1, 1);
80 ax_helm_kernel_1d<real, LX, 1024> \
81 <<<nblcks_1d, nthrds_1d, 0, stream>>>((real *) w, (real *) u, \
82 (real *) dx, (real *) dy, (real *) dz, \
83 (real *) dxt, (real *) dyt, (real *) dzt, (real *) h1,\
84 (real *) g11, (real *) g22, (real *) g33, \
85 (real *) g12, (real *) g13, (real *) g23); \
86 CUDA_CHECK(cudaGetLastError());
88 #define CASE_KSTEP(LX) \
89 ax_helm_kernel_kstep<real, LX> \
90 <<<nblcks_kstep, nthrds_kstep, 0, stream>>>((real *) w, (real *) u, \
91 (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
92 (real *) g11, (real *) g22, (real *) g33, \
93 (real *) g12, (real *) g13, (real *) g23); \
94 CUDA_CHECK(cudaGetLastError());
96 #define CASE_KSTEP_PADDED(LX) \
97 ax_helm_kernel_kstep_padded<real, LX> \
98 <<<nblcks_kstep, nthrds_kstep, 0, stream>>>((real *) w, (real *) u, \
99 (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
100 (real *) g11, (real *) g22, (real *) g33, \
101 (real *) g12, (real *) g13, (real *) g23); \
102 CUDA_CHECK(cudaGetLastError());
106 if(autotune[LX] == 0 ) { \
107 autotune[LX]=tune<LX>( w, u, \
111 g12, g13, g23, nelv, lx); \
112 } else if (autotune[LX] == 1 ) { \
114 } else if (autotune[LX] == 2 ) { \
120 #define CASE_PADDED(LX) \
122 if(autotune[LX] == 0 ) { \
123 autotune[LX]=tune_padded<LX>(w, u, \
127 g12, g13, g23,nelv,lx); \
128 } else if (autotune[LX] == 1 ) { \
130 } else if (autotune[LX] == 2 ) { \
131 CASE_KSTEP_PADDED(LX); \
135 #define CASE_LARGE(LX) \
140 #define CASE_LARGE_PADDED(LX) \
142 CASE_KSTEP_PADDED(LX); \
160 fprintf(stderr, __FILE__
": size not supported: %d\n", *lx);
174 fprintf(stderr, __FILE__
": size not supported: %d\n", *lx);
185 void *
u,
void *
v,
void *
w,
186 void *
dx,
void *
dy,
void *
dz,
190 void *
g23,
int *nelv,
int *lx) {
192 const dim3 nthrds((*lx), (*lx), 1);
193 const dim3 nblcks((*nelv), 1, 1);
196 #define CASE_VECTOR_KSTEP(LX) \
197 ax_helm_kernel_vector_kstep<real, LX> \
198 <<<nblcks, nthrds, 0, stream>>> ((real *) au, (real *) av, (real *) aw, \
199 (real *) u, (real *) v, (real *) w, \
200 (real *) dx, (real *) dy, (real *) dz, \
201 (real *) h1, (real *) g11, (real *) g22, \
202 (real *) g33, (real *) g12, (real *) g13, \
204 CUDA_CHECK(cudaGetLastError());
206 #define CASE_VECTOR_KSTEP_PADDED(LX) \
207 ax_helm_kernel_vector_kstep_padded<real, LX> \
208 <<<nblcks, nthrds, 0, stream>>> ((real *) au, (real *) av, (real *) aw, \
209 (real *) u, (real *) v, (real *) w, \
210 (real *) dx, (real *) dy, (real *) dz, \
211 (real *) h1, (real *) g11, (real *) g22, \
212 (real *) g33, (real *) g12, (real *) g13, \
214 CUDA_CHECK(cudaGetLastError());
217 #define CASE_VECTOR(LX) \
219 CASE_VECTOR_KSTEP(LX); \
222 #define CASE_VECTOR_PADDED(LX) \
224 CASE_VECTOR_KSTEP_PADDED(LX); \
245 fprintf(stderr, __FILE__
": size not supported: %d\n", *lx);
255 void *
u,
void *
v,
void *
w,
256 void *h2,
void *B,
int *n) {
258 const dim3 nthrds(1024, 1, 1);
259 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
262 ax_helm_kernel_vector_part2<real>
269 template < const
int LX >
273 void *
g13,
void *
g23,
int *nelv,
int *lx) {
274 cudaEvent_t start,stop;
278 const dim3 nthrds_1d(1024, 1, 1);
279 const dim3 nblcks_1d((*nelv), 1, 1);
280 const dim3 nthrds_kstep((*lx), (*lx), 1);
281 const dim3 nblcks_kstep((*nelv), 1, 1);
284 char *env_value = NULL;
285 char neko_log_buf[80];
287 env_value=getenv(
"NEKO_AUTOTUNE");
289 sprintf(neko_log_buf,
"Autotune Ax helm (lx: %d)", *lx);
293 if( !strcmp(env_value,
"1D") ) {
295 sprintf(neko_log_buf,
"Set by env : 1 (1D)");
299 }
else if( !strcmp(env_value,
"KSTEP") ) {
301 sprintf(neko_log_buf,
"Set by env : 2 (KSTEP)");
306 sprintf(neko_log_buf,
"Invalid value set for NEKO_AUTOTUNE");
311 cudaEventCreate(&start);
312 cudaEventCreate(&stop);
314 cudaEventRecord(start,0);
316 for(
int i = 0;
i < 100;
i++) {
320 cudaEventRecord(stop,0);
321 cudaEventSynchronize(stop);
322 cudaEventElapsedTime(&time1, start, stop);
324 cudaEventRecord(start,0);
326 for(
int i = 0;
i < 100;
i++) {
330 cudaEventRecord(stop,0);
331 cudaEventSynchronize(stop);
332 cudaEventElapsedTime(&time2, start, stop);
340 sprintf(neko_log_buf,
"Chose : %d (%s)", retval,
341 (retval > 1 ?
"KSTEP" :
"1D"));
347 template < const
int LX >
351 void *
g13,
void *
g23,
int *nelv,
int *lx) {
352 cudaEvent_t start, stop;
356 const dim3 nthrds_1d(1024, 1, 1);
357 const dim3 nblcks_1d((*nelv), 1, 1);
358 const dim3 nthrds_kstep((*lx), (*lx), 1);
359 const dim3 nblcks_kstep((*nelv), 1, 1);
362 char *env_value = NULL;
363 char neko_log_buf[80];
365 env_value=getenv(
"NEKO_AUTOTUNE");
367 sprintf(neko_log_buf,
"Autotune Ax helm (lx: %d)", *lx);
371 if( !strcmp(env_value,
"1D") ) {
373 sprintf(neko_log_buf,
"Set by env : 1 (1D)");
377 }
else if( !strcmp(env_value,
"KSTEP") ) {
379 sprintf(neko_log_buf,
"Set by env : 2 (KSTEP)");
384 sprintf(neko_log_buf,
"Invalid value set for NEKO_AUTOTUNE");
389 cudaEventCreate(&start);
390 cudaEventCreate(&stop);
392 cudaEventRecord(start,0);
394 for(
int i = 0;
i < 100;
i++) {
398 cudaEventRecord(stop, 0);
399 cudaEventSynchronize(stop);
400 cudaEventElapsedTime(&time1, start, stop);
402 cudaEventRecord(start, 0);
404 for(
int i = 0;
i < 100;
i++) {
408 cudaEventRecord(stop, 0);
409 cudaEventSynchronize(stop);
410 cudaEventElapsedTime(&time2, start, stop);
418 sprintf(neko_log_buf,
"Chose : %d (%s)", retval,
419 (retval > 1 ?
"KSTEP" :
"1D"));
void cuda_ax_helm_vector_part2(void *au, void *av, void *aw, void *u, void *v, void *w, void *h2, void *B, int *n)
int tune(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
void cuda_ax_helm(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
void cuda_ax_helm_vector(void *au, void *av, void *aw, void *u, void *v, void *w, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
#define CASE_KSTEP_PADDED(LX)
#define CASE_LARGE_PADDED(LX)
int tune_padded(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
#define CASE_VECTOR_PADDED(LX)
__global__ void T *__restrict__ T *__restrict__ aw
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ av
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11
void log_error(char *msg)
void log_message(char *msg)
void log_section(char *msg)