50#include "cdtp_kernel.cl.h"
58 void *
dr,
void *
ds,
void *
dt,
60 void *
w3,
int *nel,
int *lx) {
82#define CASE_1D(LX, QUEUE, EVENT) \
84 cl_kernel kernel = clCreateKernel(cdtp_program, \
85 STR(cdtp_kernel_lx##LX), &err); \
88 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dtx)); \
89 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &x)); \
90 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dr)); \
91 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &ds)); \
92 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dt)); \
93 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dxt)); \
94 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dyt)); \
95 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dzt)); \
96 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3)); \
98 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
99 kernel, 1, NULL, &global_item_size, \
100 &local_item_size, 0, NULL, EVENT)); \
101 CL_CHECK(clReleaseKernel(kernel)); \
105#define CASE_KSTEP(LX, QUEUE, EVENT) \
107 cl_kernel kernel = clCreateKernel(cdtp_program, \
108 STR(cdtp_kernel_kstep_lx##LX), &err); \
111 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dtx)); \
112 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &x)); \
113 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dr)); \
114 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &ds)); \
115 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dt)); \
116 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dxt)); \
117 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dyt)); \
118 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dzt)); \
119 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3)); \
121 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
122 kernel, 2, NULL, global_kstep, \
123 local_kstep, 0, NULL, EVENT)); \
124 CL_CHECK(clReleaseKernel(kernel)); \
130 if(autotune_cdtp[LX] == 0 ) { \
131 char *env_value = NULL; \
132 char neko_log_buf[80]; \
133 env_value = getenv("NEKO_AUTOTUNE"); \
135 sprintf(neko_log_buf, "Autotune cdtp (lx: %d)", *lx); \
136 log_section(neko_log_buf); \
138 if( !strcmp(env_value,"1D") ) { \
139 CASE_1D(LX, glb_cmd_queue, NULL); \
140 sprintf(neko_log_buf,"Set by env : 1 (1D)"); \
141 log_message(neko_log_buf); \
142 autotune_cdtp[LX] = 1; \
143 } else if( !strcmp(env_value,"KSTEP") ) { \
144 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
145 sprintf(neko_log_buf,"Set by env : 2 (KSTEP)"); \
146 log_message(neko_log_buf); \
147 autotune_cdtp[LX] = 2; \
149 sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE"); \
150 log_error(neko_log_buf); \
154 CL_CHECK(clFinish(glb_cmd_queue)); \
155 cl_event perf_event, sync_event; \
156 cl_ulong start, end; \
157 CL_CHECK(clEnqueueMarker(glb_cmd_queue, &sync_event)); \
158 CL_CHECK(clEnqueueBarrier(prf_cmd_queue)); \
159 CL_CHECK(clEnqueueWaitForEvents(prf_cmd_queue, 1, &sync_event)); \
161 double elapsed1 = 0.0; \
162 for(int i = 0; i < 100; i++) { \
163 CASE_1D(LX, prf_cmd_queue, &perf_event); \
164 CL_CHECK(clWaitForEvents(1, &perf_event)); \
165 CL_CHECK(clGetEventProfilingInfo(perf_event, \
166 CL_PROFILING_COMMAND_START, \
167 sizeof(cl_ulong), &start, NULL)); \
168 CL_CHECK(clGetEventProfilingInfo(perf_event, \
169 CL_PROFILING_COMMAND_END, \
170 sizeof(cl_ulong), &end, NULL)); \
171 elapsed1 += (end - start)*1.0e-6; \
174 double elapsed2 = 0.0; \
175 for(int i = 0; i < 100; i++) { \
176 CASE_KSTEP(LX, prf_cmd_queue, &perf_event); \
177 CL_CHECK(clWaitForEvents(1, &perf_event)); \
178 CL_CHECK(clGetEventProfilingInfo(perf_event, \
179 CL_PROFILING_COMMAND_START, \
180 sizeof(cl_ulong), &start, NULL)); \
181 CL_CHECK(clGetEventProfilingInfo(perf_event, \
182 CL_PROFILING_COMMAND_END, \
183 sizeof(cl_ulong), &end, NULL)); \
184 elapsed2 += (end - start)*1.0e-6; \
187 CL_CHECK(clFinish(prf_cmd_queue)); \
188 CL_CHECK(clEnqueueMarker(prf_cmd_queue, &sync_event)); \
189 int krnl_strtgy = (elapsed1 < elapsed2 ? 1 : 2); \
190 sprintf(neko_log_buf, "Chose : %d (%s)", krnl_strtgy, \
191 (krnl_strtgy > 1 ? "KSTEP" : "1D")); \
192 autotune_cdtp[LX] = krnl_strtgy; \
193 log_message(neko_log_buf); \
194 clEnqueueBarrier(glb_cmd_queue); \
195 clEnqueueWaitForEvents(glb_cmd_queue, 1, &sync_event) ; \
198 } else if (autotune_cdtp[LX] == 1 ) { \
199 CASE_1D(LX, glb_cmd_queue, NULL); \
200 } else if (autotune_cdtp[LX] == 2 ) { \
201 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
205#define CASE_LARGE(LX) \
207 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
__global__ void const T *__restrict__ const T *__restrict__ dr
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
void opencl_kernel_jit(const char *kernel, cl_program *program)
void opencl_cdtp(void *dtx, void *x, void *dr, void *ds, void *dt, void *dxt, void *dyt, void *dzt, void *w3, int *nel, int *lx)