50#include "cdtp_kernel.cl.h"
58 void *
dr,
void *
ds,
void *
dt,
60 void *
w3,
int *nel,
int *lx) {
82#define CASE_1D(LX, QUEUE, EVENT) \
84 cl_kernel kernel = clCreateKernel(cdtp_program, \
85 STR(cdtp_kernel_lx##LX), &err); \
88 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dtx)); \
89 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &x)); \
90 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dr)); \
91 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &ds)); \
92 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dt)); \
93 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dxt)); \
94 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dyt)); \
95 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dzt)); \
96 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3)); \
98 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
99 kernel, 1, NULL, &global_item_size, \
100 &local_item_size, 0, NULL, EVENT)); \
104#define CASE_KSTEP(LX, QUEUE, EVENT) \
106 cl_kernel kernel = clCreateKernel(cdtp_program, \
107 STR(cdtp_kernel_kstep_lx##LX), &err); \
110 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dtx)); \
111 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &x)); \
112 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dr)); \
113 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &ds)); \
114 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dt)); \
115 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dxt)); \
116 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dyt)); \
117 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dzt)); \
118 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3)); \
120 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
121 kernel, 2, NULL, global_kstep, \
122 local_kstep, 0, NULL, EVENT)); \
128 if(autotune_cdtp[LX] == 0 ) { \
129 char *env_value = NULL; \
130 char neko_log_buf[80]; \
131 env_value = getenv("NEKO_AUTOTUNE"); \
133 sprintf(neko_log_buf, "Autotune cdtp (lx: %d)", *lx); \
134 log_section(neko_log_buf); \
136 if( !strcmp(env_value,"1D") ) { \
137 CASE_1D(LX, glb_cmd_queue, NULL); \
138 sprintf(neko_log_buf,"Set by env : 1 (1D)"); \
139 log_message(neko_log_buf); \
140 autotune_cdtp[LX] = 1; \
141 } else if( !strcmp(env_value,"KSTEP") ) { \
142 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
143 sprintf(neko_log_buf,"Set by env : 2 (KSTEP)"); \
144 log_message(neko_log_buf); \
145 autotune_cdtp[LX] = 2; \
147 sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE"); \
148 log_error(neko_log_buf); \
152 CL_CHECK(clFinish(glb_cmd_queue)); \
153 cl_event perf_event, sync_event; \
154 cl_ulong start, end; \
155 CL_CHECK(clEnqueueMarker(glb_cmd_queue, &sync_event)); \
156 CL_CHECK(clEnqueueBarrier(prf_cmd_queue)); \
157 CL_CHECK(clEnqueueWaitForEvents(prf_cmd_queue, 1, &sync_event)); \
159 double elapsed1 = 0.0; \
160 for(int i = 0; i < 100; i++) { \
161 CASE_1D(LX, prf_cmd_queue, &perf_event); \
162 CL_CHECK(clWaitForEvents(1, &perf_event)); \
163 CL_CHECK(clGetEventProfilingInfo(perf_event, \
164 CL_PROFILING_COMMAND_START, \
165 sizeof(cl_ulong), &start, NULL)); \
166 CL_CHECK(clGetEventProfilingInfo(perf_event, \
167 CL_PROFILING_COMMAND_END, \
168 sizeof(cl_ulong), &end, NULL)); \
169 elapsed1 += (end - start)*1.0e-6; \
172 double elapsed2 = 0.0; \
173 for(int i = 0; i < 100; i++) { \
174 CASE_KSTEP(LX, prf_cmd_queue, &perf_event); \
175 CL_CHECK(clWaitForEvents(1, &perf_event)); \
176 CL_CHECK(clGetEventProfilingInfo(perf_event, \
177 CL_PROFILING_COMMAND_START, \
178 sizeof(cl_ulong), &start, NULL)); \
179 CL_CHECK(clGetEventProfilingInfo(perf_event, \
180 CL_PROFILING_COMMAND_END, \
181 sizeof(cl_ulong), &end, NULL)); \
182 elapsed2 += (end - start)*1.0e-6; \
185 CL_CHECK(clFinish(prf_cmd_queue)); \
186 CL_CHECK(clEnqueueMarker(prf_cmd_queue, &sync_event)); \
187 int krnl_strtgy = (elapsed1 < elapsed2 ? 1 : 2); \
188 sprintf(neko_log_buf, "Chose : %d (%s)", krnl_strtgy, \
189 (krnl_strtgy > 1 ? "KSTEP" : "1D")); \
190 autotune_cdtp[LX] = krnl_strtgy; \
191 log_message(neko_log_buf); \
192 clEnqueueBarrier(glb_cmd_queue); \
193 clEnqueueWaitForEvents(glb_cmd_queue, 1, &sync_event) ; \
196 } else if (autotune_cdtp[LX] == 1 ) { \
197 CASE_1D(LX, glb_cmd_queue, NULL); \
198 } else if (autotune_cdtp[LX] == 2 ) { \
199 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
203#define CASE_LARGE(LX) \
205 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
__global__ void const T *__restrict__ const T *__restrict__ dr
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
void opencl_kernel_jit(const char *kernel, cl_program *program)
void opencl_cdtp(void *dtx, void *x, void *dr, void *ds, void *dt, void *dxt, void *dyt, void *dzt, void *w3, int *nel, int *lx)