50#include "conv1_kernel.cl.h"
58 void *
vx,
void *
vy,
void *
vz,
59 void *
dx,
void *
dy,
void *
dz,
63 void *
jacinv,
int *nel,
int *gdim,
int *lx) {
85#define CASE_1D(LX, QUEUE, EVENT) \
87 cl_kernel kernel = clCreateKernel(conv1_program, \
88 STR(conv1_kernel_lx##LX), &err); \
91 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &du)); \
92 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u)); \
93 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &vx)); \
94 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &vy)); \
95 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &vz)); \
96 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dx)); \
97 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dy)); \
98 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dz)); \
99 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &drdx)); \
100 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &dsdx)); \
101 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &dtdx)); \
102 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &drdy)); \
103 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &dsdy)); \
104 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &dtdy)); \
105 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &drdz)); \
106 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_mem), (void *) &dsdz)); \
107 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_mem), (void *) &dtdz)); \
108 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_mem), (void *) &jacinv)); \
110 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
111 kernel, 1, NULL, &global_item_size, \
112 &local_item_size, 0, NULL, EVENT)); \
115#define CASE_KSTEP(LX, QUEUE, EVENT) \
117 cl_kernel kernel = clCreateKernel(conv1_program, \
118 STR(conv1_kernel_kstep_lx##LX), &err); \
121 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &du)); \
122 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u)); \
123 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &vx)); \
124 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &vy)); \
125 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &vz)); \
126 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dx)); \
127 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dy)); \
128 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dz)); \
129 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &drdx)); \
130 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &dsdx)); \
131 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &dtdx)); \
132 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &drdy)); \
133 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &dsdy)); \
134 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &dtdy)); \
135 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &drdz)); \
136 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_mem), (void *) &dsdz)); \
137 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_mem), (void *) &dtdz)); \
138 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_mem), (void *) &jacinv)); \
140 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
141 kernel, 2, NULL, global_kstep, \
142 local_kstep, 0, NULL, EVENT)); \
148 if(autotune_conv1[LX] == 0 ) { \
149 char *env_value = NULL; \
150 char neko_log_buf[80]; \
151 env_value = getenv("NEKO_AUTOTUNE"); \
153 sprintf(neko_log_buf, "Autotune conv1 (lx: %d)", *lx); \
154 log_section(neko_log_buf); \
156 if( !strcmp(env_value,"1D") ) { \
157 CASE_1D(LX, glb_cmd_queue, NULL); \
158 sprintf(neko_log_buf,"Set by env : 1 (1D)"); \
159 log_message(neko_log_buf); \
160 autotune_conv1[LX] = 1; \
161 } else if( !strcmp(env_value,"KSTEP") ) { \
162 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
163 sprintf(neko_log_buf,"Set by env : 2 (KSTEP)"); \
164 log_message(neko_log_buf); \
165 autotune_conv1[LX] = 2; \
167 sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE"); \
168 log_error(neko_log_buf); \
172 CL_CHECK(clFinish(glb_cmd_queue)); \
173 cl_event perf_event, sync_event; \
174 cl_ulong start, end; \
175 CL_CHECK(clEnqueueMarker(glb_cmd_queue, &sync_event)); \
176 CL_CHECK(clEnqueueBarrier(prf_cmd_queue)); \
177 CL_CHECK(clEnqueueWaitForEvents(prf_cmd_queue, 1, &sync_event)); \
179 double elapsed1 = 0.0; \
180 for(int i = 0; i < 100; i++) { \
181 CASE_1D(LX, prf_cmd_queue, &perf_event); \
182 CL_CHECK(clWaitForEvents(1, &perf_event)); \
183 CL_CHECK(clGetEventProfilingInfo(perf_event, \
184 CL_PROFILING_COMMAND_START, \
185 sizeof(cl_ulong), &start, NULL)); \
186 CL_CHECK(clGetEventProfilingInfo(perf_event, \
187 CL_PROFILING_COMMAND_END, \
188 sizeof(cl_ulong), &end, NULL)); \
189 elapsed1 += (end - start)*1.0e-6; \
192 double elapsed2 = 0.0; \
193 for(int i = 0; i < 100; i++) { \
194 CASE_KSTEP(LX, prf_cmd_queue, &perf_event); \
195 CL_CHECK(clWaitForEvents(1, &perf_event)); \
196 CL_CHECK(clGetEventProfilingInfo(perf_event, \
197 CL_PROFILING_COMMAND_START, \
198 sizeof(cl_ulong), &start, NULL)); \
199 CL_CHECK(clGetEventProfilingInfo(perf_event, \
200 CL_PROFILING_COMMAND_END, \
201 sizeof(cl_ulong), &end, NULL)); \
202 elapsed2 += (end - start)*1.0e-6; \
205 CL_CHECK(clFinish(prf_cmd_queue)); \
206 CL_CHECK(clEnqueueMarker(prf_cmd_queue, &sync_event)); \
207 int krnl_strtgy = (elapsed1 < elapsed2 ? 1 : 2); \
208 sprintf(neko_log_buf, "Chose : %d (%s)", krnl_strtgy, \
209 (krnl_strtgy > 1 ? "KSTEP" : "1D")); \
210 autotune_conv1[LX] = krnl_strtgy; \
211 log_message(neko_log_buf); \
212 clEnqueueBarrier(glb_cmd_queue); \
213 clEnqueueWaitForEvents(glb_cmd_queue, 1, &sync_event) ; \
216 } else if (autotune_conv1[LX] == 1 ) { \
217 CASE_1D(LX, glb_cmd_queue, NULL); \
218 } else if (autotune_conv1[LX] == 2 ) { \
219 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
223#define CASE_LARGE(LX) \
225 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ vz
__global__ void const T *__restrict__ const T *__restrict__ vx
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ vy
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
void opencl_kernel_jit(const char *kernel, cl_program *program)
void opencl_conv1(void *du, void *u, void *vx, void *vy, void *vz, void *dx, void *dy, void *dz, void *drdx, void *dsdx, void *dtdx, void *drdy, void *dsdy, void *dtdy, void *drdz, void *dsdz, void *dtdz, void *jacinv, int *nel, int *gdim, int *lx)