50#include "conv1_kernel.cl.h" 
   58                  void *
vx, 
void *
vy, 
void *
vz,
 
   59                  void *
dx, 
void *
dy, 
void *
dz,
 
   63                  void *
jacinv, 
int *nel, 
int *gdim, 
int *lx) {
 
   85#define CASE_1D(LX, QUEUE, EVENT)                                               \ 
   87      cl_kernel kernel = clCreateKernel(conv1_program,                          \ 
   88                                        STR(conv1_kernel_lx##LX), &err);        \ 
   91      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &du));        \ 
   92      CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u));         \ 
   93      CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &vx));        \ 
   94      CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &vy));        \ 
   95      CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &vz));        \ 
   96      CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dx));        \ 
   97      CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dy));        \ 
   98      CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dz));        \ 
   99      CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &drdx));      \ 
  100      CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &dsdx));      \ 
  101      CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &dtdx));     \ 
  102      CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &drdy));     \ 
  103      CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &dsdy));     \ 
  104      CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &dtdy));     \ 
  105      CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &drdz));     \ 
  106      CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_mem), (void *) &dsdz));     \ 
  107      CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_mem), (void *) &dtdz));     \ 
  108      CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_mem), (void *) &jacinv));   \ 
  110      CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE,                 \ 
  111                                      kernel, 1, NULL, &global_item_size,       \ 
  112                                      &local_item_size, 0, NULL, EVENT));       \ 
  115#define CASE_KSTEP(LX, QUEUE, EVENT)                                            \ 
  117      cl_kernel kernel = clCreateKernel(conv1_program,                          \ 
  118                                        STR(conv1_kernel_kstep_lx##LX), &err);  \ 
  121      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &du));        \ 
  122      CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u));         \ 
  123      CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &vx));        \ 
  124      CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &vy));        \ 
  125      CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &vz));        \ 
  126      CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dx));        \ 
  127      CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dy));        \ 
  128      CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dz));        \ 
  129      CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &drdx));      \ 
  130      CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &dsdx));      \ 
  131      CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &dtdx));     \ 
  132      CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &drdy));     \ 
  133      CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &dsdy));     \ 
  134      CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &dtdy));     \ 
  135      CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &drdz));     \ 
  136      CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_mem), (void *) &dsdz));     \ 
  137      CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_mem), (void *) &dtdz));     \ 
  138      CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_mem), (void *) &jacinv));   \ 
  140      CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE,                 \ 
  141                                      kernel, 2, NULL, global_kstep,            \ 
  142                                      local_kstep, 0, NULL, EVENT));            \ 
  148      if(autotune_conv1[LX] == 0 ) {                                            \ 
  149        char *env_value = NULL;                                                 \ 
  150        char neko_log_buf[80];                                                  \ 
  151        env_value = getenv("NEKO_AUTOTUNE");                                    \ 
  153        sprintf(neko_log_buf, "Autotune conv1 (lx: %d)", *lx);                  \ 
  154        log_section(neko_log_buf);                                              \ 
  156          if( !strcmp(env_value,"1D") ) {                                       \ 
  157            CASE_1D(LX, glb_cmd_queue, NULL);                                   \ 
  158            sprintf(neko_log_buf,"Set by env : 1 (1D)");                        \ 
  159            log_message(neko_log_buf);                                          \ 
  160            autotune_conv1[LX] = 1;                                             \ 
  161          } else if( !strcmp(env_value,"KSTEP") ) {                             \ 
  162            CASE_KSTEP(LX, glb_cmd_queue, NULL);                                \ 
  163            sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");                     \ 
  164            log_message(neko_log_buf);                                          \ 
  165            autotune_conv1[LX] = 2;                                             \ 
  167            sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");       \ 
  168            log_error(neko_log_buf);                                            \ 
  172          CL_CHECK(clFinish(glb_cmd_queue));                                    \ 
  173          cl_event perf_event, sync_event;                                      \ 
  174          cl_ulong start, end;                                                  \ 
  175          CL_CHECK(clEnqueueMarker(glb_cmd_queue, &sync_event));                \ 
  176          CL_CHECK(clEnqueueBarrier(prf_cmd_queue));                            \ 
  177          CL_CHECK(clEnqueueWaitForEvents(prf_cmd_queue, 1, &sync_event));      \ 
  179          double elapsed1 = 0.0;                                                \ 
  180          for(int i = 0; i < 100; i++) {                                        \ 
  181            CASE_1D(LX, prf_cmd_queue, &perf_event);                            \ 
  182            CL_CHECK(clWaitForEvents(1, &perf_event));                          \ 
  183            CL_CHECK(clGetEventProfilingInfo(perf_event,                        \ 
  184                                             CL_PROFILING_COMMAND_START,        \ 
  185                                             sizeof(cl_ulong), &start, NULL));  \ 
  186            CL_CHECK(clGetEventProfilingInfo(perf_event,                        \ 
  187                                             CL_PROFILING_COMMAND_END,          \ 
  188                                             sizeof(cl_ulong), &end, NULL));    \ 
  189            elapsed1 += (end - start)*1.0e-6;                                   \ 
  192          double elapsed2 = 0.0;                                                \ 
  193          for(int i = 0; i < 100; i++) {                                        \ 
  194            CASE_KSTEP(LX, prf_cmd_queue, &perf_event);                         \ 
  195            CL_CHECK(clWaitForEvents(1, &perf_event));                          \ 
  196            CL_CHECK(clGetEventProfilingInfo(perf_event,                        \ 
  197                                             CL_PROFILING_COMMAND_START,        \ 
  198                                             sizeof(cl_ulong), &start, NULL));  \ 
  199            CL_CHECK(clGetEventProfilingInfo(perf_event,                        \ 
  200                                             CL_PROFILING_COMMAND_END,          \ 
  201                                             sizeof(cl_ulong), &end, NULL));    \ 
  202            elapsed2 += (end - start)*1.0e-6;                                   \ 
  205          CL_CHECK(clFinish(prf_cmd_queue));                                    \ 
  206          CL_CHECK(clEnqueueMarker(prf_cmd_queue, &sync_event));                \ 
  207          int krnl_strtgy = (elapsed1 < elapsed2 ? 1 : 2);                      \ 
  208          sprintf(neko_log_buf, "Chose      : %d (%s)", krnl_strtgy,            \ 
  209                  (krnl_strtgy > 1 ? "KSTEP" : "1D"));                          \ 
  210          autotune_conv1[LX] = krnl_strtgy;                                     \ 
  211          log_message(neko_log_buf);                                            \ 
  212          clEnqueueBarrier(glb_cmd_queue);                                      \ 
  213          clEnqueueWaitForEvents(glb_cmd_queue, 1, &sync_event) ;               \ 
  216      } else if (autotune_conv1[LX] == 1 ) {                                    \ 
  217        CASE_1D(LX, glb_cmd_queue, NULL);                                       \ 
  218      } else if (autotune_conv1[LX] == 2 ) {                                    \ 
  219        CASE_KSTEP(LX, glb_cmd_queue, NULL);                                    \ 
  223#define CASE_LARGE(LX)                                                          \ 
  225      CASE_KSTEP(LX, glb_cmd_queue, NULL);                                      \ 
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ vz
 
__global__ void const T *__restrict__ const T *__restrict__ vx
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ vy
 
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
 
void opencl_kernel_jit(const char *kernel, cl_program *program)
 
void opencl_conv1(void *du, void *u, void *vx, void *vy, void *vz, void *dx, void *dy, void *dz, void *drdx, void *dsdx, void *dtdx, void *drdy, void *dsdy, void *dtdy, void *drdz, void *dsdz, void *dtdz, void *jacinv, int *nel, int *gdim, int *lx)