50#include "ax_helm_kernel.cl.h"
60 void *
g13,
void *
g23,
int *nelv,
int *lx) {
83#define CASE_1D(LX, QUEUE, EVENT) \
85 cl_kernel kernel = clCreateKernel(ax_helm_program, \
86 STR(ax_helm_kernel_lx##LX), &err); \
89 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w)); \
90 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u)); \
91 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dx)); \
92 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &dy)); \
93 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dz)); \
94 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dxt)); \
95 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dyt)); \
96 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dzt)); \
97 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &h1)); \
98 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &g11)); \
99 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &g22)); \
100 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &g33)); \
101 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &g12)); \
102 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &g13)); \
103 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &g23)); \
105 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
106 kernel, 1, NULL, &global_item_size, \
107 &local_item_size, 0, NULL, EVENT)); \
112#define CASE_KSTEP(LX, QUEUE, EVENT) \
114 cl_kernel kernel = clCreateKernel(ax_helm_program, \
115 STR(ax_helm_kernel_kstep_lx##LX), &err);\
118 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w)); \
119 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u)); \
120 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dx)); \
121 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &dy)); \
122 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dz)); \
123 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &h1)); \
124 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &g11)); \
125 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &g22)); \
126 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &g33)); \
127 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &g12)); \
128 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &g13)); \
129 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &g23)); \
131 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
132 kernel, 2, NULL, global_kstep, \
133 local_kstep, 0, NULL, EVENT)); \
140 if(autotune_ax_helm[LX] == 0 ) { \
141 char *env_value = NULL; \
142 char neko_log_buf[80]; \
143 env_value = getenv("NEKO_AUTOTUNE"); \
145 sprintf(neko_log_buf, "Autotune Ax helm (lx: %d)", *lx); \
146 log_section(neko_log_buf); \
148 if( !strcmp(env_value,"1D") ) { \
149 CASE_1D(LX, glb_cmd_queue, NULL); \
150 sprintf(neko_log_buf,"Set by env : 1 (1D)"); \
151 log_message(neko_log_buf); \
152 autotune_ax_helm[LX] = 1; \
153 } else if( !strcmp(env_value,"KSTEP") ) { \
154 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
155 sprintf(neko_log_buf,"Set by env : 2 (KSTEP)"); \
156 log_message(neko_log_buf); \
157 autotune_ax_helm[LX] = 2; \
159 sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE"); \
160 log_error(neko_log_buf); \
164 CL_CHECK(clFinish(glb_cmd_queue)); \
165 cl_event perf_event, sync_event; \
166 cl_ulong start, end; \
167 CL_CHECK(clEnqueueMarker(glb_cmd_queue, &sync_event)); \
168 CL_CHECK(clEnqueueBarrier(prf_cmd_queue)); \
169 CL_CHECK(clEnqueueWaitForEvents(prf_cmd_queue, 1, &sync_event)); \
171 double elapsed1 = 0.0; \
172 for(int i = 0; i < 100; i++) { \
173 CASE_1D(LX, prf_cmd_queue, &perf_event); \
174 CL_CHECK(clWaitForEvents(1, &perf_event)); \
175 CL_CHECK(clGetEventProfilingInfo(perf_event, \
176 CL_PROFILING_COMMAND_START, \
177 sizeof(cl_ulong), &start, NULL)); \
178 CL_CHECK(clGetEventProfilingInfo(perf_event, \
179 CL_PROFILING_COMMAND_END, \
180 sizeof(cl_ulong), &end, NULL)); \
181 elapsed1 += (end - start)*1.0e-6; \
184 double elapsed2 = 0.0; \
185 for(int i = 0; i < 100; i++) { \
186 CASE_KSTEP(LX, prf_cmd_queue, &perf_event); \
187 CL_CHECK(clWaitForEvents(1, &perf_event)); \
188 CL_CHECK(clGetEventProfilingInfo(perf_event, \
189 CL_PROFILING_COMMAND_START, \
190 sizeof(cl_ulong), &start, NULL)); \
191 CL_CHECK(clGetEventProfilingInfo(perf_event, \
192 CL_PROFILING_COMMAND_END, \
193 sizeof(cl_ulong), &end, NULL)); \
194 elapsed2 += (end - start)*1.0e-6; \
197 CL_CHECK(clFinish(prf_cmd_queue)); \
198 CL_CHECK(clEnqueueMarker(prf_cmd_queue, &sync_event)); \
199 int krnl_strtgy = (elapsed1 < elapsed2 ? 1 : 2); \
200 sprintf(neko_log_buf, "Chose : %d (%s)", krnl_strtgy, \
201 (krnl_strtgy > 1 ? "KSTEP" : "1D")); \
202 autotune_ax_helm[LX] = krnl_strtgy; \
203 log_message(neko_log_buf); \
204 clEnqueueBarrier(glb_cmd_queue); \
205 clEnqueueWaitForEvents(glb_cmd_queue, 1, &sync_event) ; \
208 } else if (autotune_ax_helm[LX] == 1 ) { \
209 CASE_1D(LX, glb_cmd_queue, NULL); \
210 } else if (autotune_ax_helm[LX] == 2 ) { \
211 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
215#define CASE_LARGE(LX) \
217 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
259 void *
u,
void *
v,
void *
w,
260 void *
dx,
void *
dy,
void *
dz,
263 void *
g13,
void *
g23,
int *nelv,
int *lx) {
279#define CASE_VECTOR(LX) \
283 clCreateKernel(ax_helm_program, \
284 STR(ax_helm_kernel_vector_kstep_lx##LX), &err); \
287 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &au)); \
288 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &av)); \
289 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &aw)); \
290 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u)); \
291 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v)); \
292 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &w)); \
293 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dx)); \
294 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dy)); \
295 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &dz)); \
296 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &h1)); \
297 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &g11)); \
298 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &g22)); \
299 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &g33)); \
300 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &g12)); \
301 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &g13)); \
302 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_mem), (void *) &g23)); \
304 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, \
305 kernel, 2, NULL, global_kstep, \
306 local_kstep, 0, NULL, NULL)); \
void opencl_ax_helm_vector(void *au, void *av, void *aw, void *u, void *v, void *w, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
void opencl_ax_helm(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
__global__ void T *__restrict__ T *__restrict__ aw
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ av
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11
void opencl_kernel_jit(const char *kernel, cl_program *program)