50#include "cdtp_kernel.cl.h" 
   58                 void *
dr, 
void *
ds, 
void *
dt,
 
   60                 void *
w3, 
int *nel, 
int *lx) {
 
   82#define CASE_1D(LX, QUEUE, EVENT)                                               \ 
   84      cl_kernel kernel = clCreateKernel(cdtp_program,                           \ 
   85                                        STR(cdtp_kernel_lx##LX), &err);         \ 
   88      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dtx));       \ 
   89      CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &x));         \ 
   90      CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dr));        \ 
   91      CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &ds));        \ 
   92      CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dt));        \ 
   93      CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dxt));       \ 
   94      CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dyt));       \ 
   95      CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dzt));       \ 
   96      CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3));        \ 
   98      CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE,                 \ 
   99                                      kernel, 1, NULL, &global_item_size,       \ 
  100                                      &local_item_size, 0, NULL, EVENT));       \ 
  104#define CASE_KSTEP(LX, QUEUE, EVENT)                                            \ 
  106      cl_kernel kernel = clCreateKernel(cdtp_program,                           \ 
  107                                        STR(cdtp_kernel_kstep_lx##LX), &err);   \ 
  110      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dtx));       \ 
  111      CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &x));         \ 
  112      CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dr));        \ 
  113      CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &ds));        \ 
  114      CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dt));        \ 
  115      CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dxt));       \ 
  116      CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dyt));       \ 
  117      CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dzt));       \ 
  118      CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3));        \ 
  120      CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE,                 \ 
  121                                      kernel, 2, NULL, global_kstep,            \ 
  122                                      local_kstep, 0, NULL, EVENT));            \ 
  128      if(autotune_cdtp[LX] == 0 ) {                                             \ 
  129        char *env_value = NULL;                                                 \ 
  130        char neko_log_buf[80];                                                  \ 
  131        env_value = getenv("NEKO_AUTOTUNE");                                    \ 
  133        sprintf(neko_log_buf, "Autotune cdtp (lx: %d)", *lx);                   \ 
  134        log_section(neko_log_buf);                                              \ 
  136          if( !strcmp(env_value,"1D") ) {                                       \ 
  137            CASE_1D(LX, glb_cmd_queue, NULL);                                   \ 
  138            sprintf(neko_log_buf,"Set by env : 1 (1D)");                        \ 
  139            log_message(neko_log_buf);                                          \ 
  140            autotune_cdtp[LX] = 1;                                              \ 
  141          } else if( !strcmp(env_value,"KSTEP") ) {                             \ 
  142            CASE_KSTEP(LX, glb_cmd_queue, NULL);                                \ 
  143            sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");                     \ 
  144            log_message(neko_log_buf);                                          \ 
  145            autotune_cdtp[LX] = 2;                                              \ 
  147            sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");       \ 
  148            log_error(neko_log_buf);                                            \ 
  152          CL_CHECK(clFinish(glb_cmd_queue));                                    \ 
  153          cl_event perf_event, sync_event;                                      \ 
  154          cl_ulong start, end;                                                  \ 
  155          CL_CHECK(clEnqueueMarker(glb_cmd_queue, &sync_event));                \ 
  156          CL_CHECK(clEnqueueBarrier(prf_cmd_queue));                            \ 
  157          CL_CHECK(clEnqueueWaitForEvents(prf_cmd_queue, 1, &sync_event));      \ 
  159          double elapsed1 = 0.0;                                                \ 
  160          for(int i = 0; i < 100; i++) {                                        \ 
  161            CASE_1D(LX, prf_cmd_queue, &perf_event);                            \ 
  162            CL_CHECK(clWaitForEvents(1, &perf_event));                          \ 
  163            CL_CHECK(clGetEventProfilingInfo(perf_event,                        \ 
  164                                             CL_PROFILING_COMMAND_START,        \ 
  165                                             sizeof(cl_ulong), &start, NULL));  \ 
  166            CL_CHECK(clGetEventProfilingInfo(perf_event,                        \ 
  167                                             CL_PROFILING_COMMAND_END,          \ 
  168                                             sizeof(cl_ulong), &end, NULL));    \ 
  169            elapsed1 += (end - start)*1.0e-6;                                   \ 
  172          double elapsed2 = 0.0;                                                \ 
  173          for(int i = 0; i < 100; i++) {                                        \ 
  174            CASE_KSTEP(LX, prf_cmd_queue, &perf_event);                         \ 
  175            CL_CHECK(clWaitForEvents(1, &perf_event));                          \ 
  176            CL_CHECK(clGetEventProfilingInfo(perf_event,                        \ 
  177                                             CL_PROFILING_COMMAND_START,        \ 
  178                                             sizeof(cl_ulong), &start, NULL));  \ 
  179            CL_CHECK(clGetEventProfilingInfo(perf_event,                        \ 
  180                                             CL_PROFILING_COMMAND_END,          \ 
  181                                             sizeof(cl_ulong), &end, NULL));    \ 
  182            elapsed2 += (end - start)*1.0e-6;                                   \ 
  185          CL_CHECK(clFinish(prf_cmd_queue));                                    \ 
  186          CL_CHECK(clEnqueueMarker(prf_cmd_queue, &sync_event));                \ 
  187          int krnl_strtgy = (elapsed1 < elapsed2 ? 1 : 2);                      \ 
  188          sprintf(neko_log_buf, "Chose      : %d (%s)", krnl_strtgy,            \ 
  189                  (krnl_strtgy > 1 ? "KSTEP" : "1D"));                          \ 
  190          autotune_cdtp[LX] = krnl_strtgy;                                      \ 
  191          log_message(neko_log_buf);                                            \ 
  192          clEnqueueBarrier(glb_cmd_queue);                                      \ 
  193          clEnqueueWaitForEvents(glb_cmd_queue, 1, &sync_event) ;               \ 
  196      } else if (autotune_cdtp[LX] == 1 ) {                                     \ 
  197        CASE_1D(LX, glb_cmd_queue, NULL);                                       \ 
  198      } else if (autotune_cdtp[LX] == 2 ) {                                     \ 
  199        CASE_KSTEP(LX, glb_cmd_queue, NULL);                                    \ 
  203#define CASE_LARGE(LX)                                                          \ 
  205      CASE_KSTEP(LX, glb_cmd_queue, NULL);                                      \ 
 
__global__ void const T *__restrict__ const T *__restrict__ dr
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
 
__global__ void const T *__restrict__ x
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
 
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
 
void opencl_kernel_jit(const char *kernel, cl_program *program)
 
void opencl_cdtp(void *dtx, void *x, void *dr, void *ds, void *dt, void *dxt, void *dyt, void *dzt, void *w3, int *nel, int *lx)