50#include "ax_helm_kernel.cl.h" 
   60                    void *
g13, 
void *
g23, 
int *nelv, 
int *lx) {
 
   83#define CASE_1D(LX, QUEUE, EVENT)                                               \ 
   85      cl_kernel kernel = clCreateKernel(ax_helm_program,                        \ 
   86                                        STR(ax_helm_kernel_lx##LX), &err);      \ 
   89      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w));         \ 
   90      CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u));         \ 
   91      CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dx));        \ 
   92      CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &dy));        \ 
   93      CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dz));        \ 
   94      CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dxt));       \ 
   95      CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dyt));       \ 
   96      CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dzt));       \ 
   97      CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &h1));        \ 
   98      CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &g11));       \ 
   99      CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &g22));      \ 
  100      CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &g33));      \ 
  101      CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &g12));      \ 
  102      CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &g13));      \ 
  103      CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &g23));      \ 
  105      CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE,                 \ 
  106                                     kernel, 1, NULL, &global_item_size,        \ 
  107                                     &local_item_size, 0, NULL, EVENT));        \ 
  112#define CASE_KSTEP(LX, QUEUE, EVENT)                                            \ 
  114      cl_kernel kernel = clCreateKernel(ax_helm_program,                        \ 
  115                                        STR(ax_helm_kernel_kstep_lx##LX), &err);\ 
  118      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w));         \ 
  119      CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u));         \ 
  120      CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dx));        \ 
  121      CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &dy));        \ 
  122      CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dz));        \ 
  123      CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &h1));        \ 
  124      CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &g11));       \ 
  125      CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &g22));       \ 
  126      CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &g33));       \ 
  127      CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &g12));       \ 
  128      CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &g13));      \ 
  129      CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &g23));      \ 
  131     CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE,                  \ 
  132                                     kernel, 2, NULL, global_kstep,             \ 
  133                                     local_kstep, 0, NULL, EVENT));             \ 
  140      if(autotune_ax_helm[LX] == 0 ) {                                          \ 
  141        char *env_value = NULL;                                                 \ 
  142        char neko_log_buf[80];                                                  \ 
  143        env_value = getenv("NEKO_AUTOTUNE");                                    \ 
  145        sprintf(neko_log_buf, "Autotune Ax helm (lx: %d)", *lx);                \ 
  146        log_section(neko_log_buf);                                              \ 
  148          if( !strcmp(env_value,"1D") ) {                                       \ 
  149            CASE_1D(LX, glb_cmd_queue, NULL);                                   \ 
  150            sprintf(neko_log_buf,"Set by env : 1 (1D)");                        \ 
  151            log_message(neko_log_buf);                                          \ 
  152            autotune_ax_helm[LX] = 1;                                           \ 
  153          } else if( !strcmp(env_value,"KSTEP") ) {                             \ 
  154            CASE_KSTEP(LX, glb_cmd_queue, NULL);                                \ 
  155            sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");                     \ 
  156            log_message(neko_log_buf);                                          \ 
  157            autotune_ax_helm[LX] = 2;                                           \ 
  159            sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");       \ 
  160            log_error(neko_log_buf);                                            \ 
  164          CL_CHECK(clFinish(glb_cmd_queue));                                    \ 
  165          cl_event perf_event, sync_event;                                      \ 
  166          cl_ulong start, end;                                                  \ 
  167          CL_CHECK(clEnqueueMarker(glb_cmd_queue, &sync_event));                \ 
  168          CL_CHECK(clEnqueueBarrier(prf_cmd_queue));                            \ 
  169          CL_CHECK(clEnqueueWaitForEvents(prf_cmd_queue, 1, &sync_event));      \ 
  171          double elapsed1 = 0.0;                                                \ 
  172          for(int i = 0; i < 100; i++) {                                        \ 
  173            CASE_1D(LX, prf_cmd_queue, &perf_event);                            \ 
  174            CL_CHECK(clWaitForEvents(1, &perf_event));                          \ 
  175            CL_CHECK(clGetEventProfilingInfo(perf_event,                        \ 
  176                                             CL_PROFILING_COMMAND_START,        \ 
  177                                             sizeof(cl_ulong), &start, NULL));  \ 
  178            CL_CHECK(clGetEventProfilingInfo(perf_event,                        \ 
  179                                             CL_PROFILING_COMMAND_END,          \ 
  180                                             sizeof(cl_ulong), &end, NULL));    \ 
  181            elapsed1 += (end - start)*1.0e-6;                                   \ 
  184          double elapsed2 = 0.0;                                                \ 
  185          for(int i = 0; i < 100; i++) {                                        \ 
  186            CASE_KSTEP(LX, prf_cmd_queue, &perf_event);                         \ 
  187            CL_CHECK(clWaitForEvents(1, &perf_event));                          \ 
  188            CL_CHECK(clGetEventProfilingInfo(perf_event,                        \ 
  189                                             CL_PROFILING_COMMAND_START,        \ 
  190                                             sizeof(cl_ulong), &start, NULL));  \ 
  191            CL_CHECK(clGetEventProfilingInfo(perf_event,                        \ 
  192                                             CL_PROFILING_COMMAND_END,          \ 
  193                                             sizeof(cl_ulong), &end, NULL));    \ 
  194            elapsed2 += (end - start)*1.0e-6;                                   \ 
  197          CL_CHECK(clFinish(prf_cmd_queue));                                    \ 
  198          CL_CHECK(clEnqueueMarker(prf_cmd_queue, &sync_event));                \ 
  199          int krnl_strtgy = (elapsed1 < elapsed2 ? 1 : 2);                      \ 
  200          sprintf(neko_log_buf, "Chose      : %d (%s)", krnl_strtgy,            \ 
  201                  (krnl_strtgy > 1 ? "KSTEP" : "1D"));                          \ 
  202          autotune_ax_helm[LX] = krnl_strtgy;                                   \ 
  203          log_message(neko_log_buf);                                            \ 
  204          clEnqueueBarrier(glb_cmd_queue);                                      \ 
  205          clEnqueueWaitForEvents(glb_cmd_queue, 1, &sync_event) ;               \ 
  208      } else if (autotune_ax_helm[LX] == 1 ) {                                  \ 
  209        CASE_1D(LX, glb_cmd_queue, NULL);                                       \ 
  210      } else if (autotune_ax_helm[LX] == 2 ) {                                  \ 
  211        CASE_KSTEP(LX, glb_cmd_queue, NULL);                                    \ 
  215#define CASE_LARGE(LX)                                                          \ 
  217      CASE_KSTEP(LX, glb_cmd_queue, NULL);                                      \ 
 
  259                           void *
u, 
void *
v, 
void *
w,
 
  260                           void *
dx, 
void *
dy, 
void *
dz,
 
  263                           void *
g13, 
void *
g23, 
int *nelv, 
int *lx) {
 
  279#define CASE_VECTOR(LX)                                                         \ 
  283        clCreateKernel(ax_helm_program,                                         \ 
  284                       STR(ax_helm_kernel_vector_kstep_lx##LX), &err);          \ 
  287      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &au));        \ 
  288      CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &av));        \ 
  289      CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &aw));        \ 
  290      CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u));         \ 
  291      CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v));         \ 
  292      CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &w));         \ 
  293      CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dx));        \ 
  294      CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dy));        \ 
  295      CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &dz));        \ 
  296      CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &h1));        \ 
  297      CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &g11));      \ 
  298      CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &g22));      \ 
  299      CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &g33));      \ 
  300      CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &g12));      \ 
  301      CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &g13));      \ 
  302      CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_mem), (void *) &g23));      \ 
  304     CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue,          \ 
  305                                     kernel, 2, NULL, global_kstep,             \ 
  306                                     local_kstep, 0, NULL, NULL));              \ 
 
void opencl_ax_helm_vector(void *au, void *av, void *aw, void *u, void *v, void *w, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
 
void opencl_ax_helm(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
 
__global__ void T *__restrict__ T *__restrict__ aw
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
 
__global__ void T *__restrict__ av
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
 
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11
 
void opencl_kernel_jit(const char *kernel, cl_program *program)