50#include "dudxyz_kernel.cl.h"
58 void *
dr,
void *
ds,
void *
dt,
59 void *
dx,
void *
dy,
void *
dz,
60 void *
jacinv,
int *nel,
int *lx) {
82#define CASE_1D(LX, QUEUE, EVENT) \
84 cl_kernel kernel = clCreateKernel(dudxyz_program, \
85 STR(dudxyz_kernel_lx##LX), &err); \
88 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &du)); \
89 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u)); \
90 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dr)); \
91 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &ds)); \
92 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dt)); \
93 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dx)); \
94 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dy)); \
95 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dz)); \
96 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &jacinv)); \
98 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
99 kernel, 1, NULL, &global_item_size, \
100 &local_item_size, 0, NULL, EVENT)); \
103#define CASE_KSTEP(LX, QUEUE, EVENT) \
105 cl_kernel kernel = clCreateKernel(dudxyz_program, \
106 STR(dudxyz_kernel_kstep_lx##LX), &err); \
109 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &du)); \
110 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u)); \
111 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dr)); \
112 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &ds)); \
113 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dt)); \
114 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dx)); \
115 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dy)); \
116 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dz)); \
117 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &jacinv)); \
119 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
120 kernel, 2, NULL, global_kstep, \
121 local_kstep, 0, NULL, EVENT)); \
126 if(autotune_dudxyz[LX] == 0 ) { \
127 char *env_value = NULL; \
128 char neko_log_buf[80]; \
129 env_value = getenv("NEKO_AUTOTUNE"); \
131 sprintf(neko_log_buf, "Autotune dudxyz (lx: %d)", *lx); \
132 log_section(neko_log_buf); \
134 if( !strcmp(env_value,"1D") ) { \
135 CASE_1D(LX, glb_cmd_queue, NULL); \
136 sprintf(neko_log_buf,"Set by env : 1 (1D)"); \
137 log_message(neko_log_buf); \
138 autotune_dudxyz[LX] = 1; \
139 } else if( !strcmp(env_value,"KSTEP") ) { \
140 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
141 sprintf(neko_log_buf,"Set by env : 2 (KSTEP)"); \
142 log_message(neko_log_buf); \
143 autotune_dudxyz[LX] = 2; \
145 sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE"); \
146 log_error(neko_log_buf); \
150 CL_CHECK(clFinish(glb_cmd_queue)); \
151 cl_event perf_event, sync_event; \
152 cl_ulong start, end; \
153 CL_CHECK(clEnqueueMarker(glb_cmd_queue, &sync_event)); \
154 CL_CHECK(clEnqueueBarrier(prf_cmd_queue)); \
155 CL_CHECK(clEnqueueWaitForEvents(prf_cmd_queue, 1, &sync_event)); \
157 double elapsed1 = 0.0; \
158 for(int i = 0; i < 100; i++) { \
159 CASE_1D(LX, prf_cmd_queue, &perf_event); \
160 CL_CHECK(clWaitForEvents(1, &perf_event)); \
161 CL_CHECK(clGetEventProfilingInfo(perf_event, \
162 CL_PROFILING_COMMAND_START, \
163 sizeof(cl_ulong), &start, NULL)); \
164 CL_CHECK(clGetEventProfilingInfo(perf_event, \
165 CL_PROFILING_COMMAND_END, \
166 sizeof(cl_ulong), &end, NULL)); \
167 elapsed1 += (end - start)*1.0e-6; \
170 double elapsed2 = 0.0; \
171 for(int i = 0; i < 100; i++) { \
172 CASE_KSTEP(LX, prf_cmd_queue, &perf_event); \
173 CL_CHECK(clWaitForEvents(1, &perf_event)); \
174 CL_CHECK(clGetEventProfilingInfo(perf_event, \
175 CL_PROFILING_COMMAND_START, \
176 sizeof(cl_ulong), &start, NULL)); \
177 CL_CHECK(clGetEventProfilingInfo(perf_event, \
178 CL_PROFILING_COMMAND_END, \
179 sizeof(cl_ulong), &end, NULL)); \
180 elapsed2 += (end - start)*1.0e-6; \
183 CL_CHECK(clFinish(prf_cmd_queue)); \
184 CL_CHECK(clEnqueueMarker(prf_cmd_queue, &sync_event)); \
185 int krnl_strtgy = (elapsed1 < elapsed2 ? 1 : 2); \
186 sprintf(neko_log_buf, "Chose : %d (%s)", krnl_strtgy, \
187 (krnl_strtgy > 1 ? "KSTEP" : "1D")); \
188 autotune_dudxyz[LX] = krnl_strtgy; \
189 log_message(neko_log_buf); \
190 clEnqueueBarrier(glb_cmd_queue); \
191 clEnqueueWaitForEvents(glb_cmd_queue, 1, &sync_event) ; \
194 } else if (autotune_dudxyz[LX] == 1 ) { \
195 CASE_1D(LX, glb_cmd_queue, NULL); \
196 } else if (autotune_dudxyz[LX] == 2 ) { \
197 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
201#define CASE_LARGE(LX) \
203 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__global__ void const T *__restrict__ const T *__restrict__ dr
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
void opencl_kernel_jit(const char *kernel, cl_program *program)
void opencl_dudxyz(void *du, void *u, void *dr, void *ds, void *dt, void *dx, void *dy, void *dz, void *jacinv, int *nel, int *lx)