Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
ax_helm.c
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifdef __APPLE__
36#include <OpenCL/cl.h>
37#else
38#include <CL/cl.h>
39#endif
40
41#include <stdlib.h>
42#include <stdio.h>
43#include <string.h>
45#include <device/opencl/jit.h>
47#include <device/opencl/check.h>
48#include <common/neko_log.h>
49
50#include "ax_helm_kernel.cl.h"
51
53
57void opencl_ax_helm(void *w, void *u, void *dx, void *dy, void *dz,
58 void *dxt, void *dyt, void *dzt, void *h1,
59 void *g11, void *g22, void *g33, void *g12,
60 void *g13, void *g23, int *nelv, int *lx) {
61
62 cl_int err;
63
64 if (ax_helm_program == NULL)
66
67 const size_t global_item_size = 256 * (*nelv);
68 const size_t local_item_size = 256;
69
70 size_t global_kstep[2];
71 size_t local_kstep[2];
72 local_kstep[0] = (*lx);
73 local_kstep[1] = (*lx);
74 global_kstep[0] = (*nelv) * (*lx);
75 global_kstep[1] = (*lx);
76
77 if (autotune_ax_helm == NULL) {
78 autotune_ax_helm = malloc(13 * sizeof(int));
79 memset(autotune_ax_helm, 0, 13 * sizeof(int));
80 }
81
82#define STR(X) #X
83#define CASE_1D(LX, QUEUE, EVENT) \
84 { \
85 cl_kernel kernel = clCreateKernel(ax_helm_program, \
86 STR(ax_helm_kernel_lx##LX), &err); \
87 CL_CHECK(err); \
88 \
89 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w)); \
90 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u)); \
91 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dx)); \
92 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &dy)); \
93 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dz)); \
94 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dxt)); \
95 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dyt)); \
96 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dzt)); \
97 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &h1)); \
98 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &g11)); \
99 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &g22)); \
100 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &g33)); \
101 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &g12)); \
102 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &g13)); \
103 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &g23)); \
104 \
105 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
106 kernel, 1, NULL, &global_item_size, \
107 &local_item_size, 0, NULL, EVENT)); \
108 \
109 }
110
111
112#define CASE_KSTEP(LX, QUEUE, EVENT) \
113 { \
114 cl_kernel kernel = clCreateKernel(ax_helm_program, \
115 STR(ax_helm_kernel_kstep_lx##LX), &err);\
116 CL_CHECK(err); \
117 \
118 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w)); \
119 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u)); \
120 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dx)); \
121 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &dy)); \
122 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dz)); \
123 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &h1)); \
124 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &g11)); \
125 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &g22)); \
126 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &g33)); \
127 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &g12)); \
128 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &g13)); \
129 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &g23)); \
130 \
131 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
132 kernel, 2, NULL, global_kstep, \
133 local_kstep, 0, NULL, EVENT)); \
134 \
135 }
136
137
138#define CASE(LX) \
139 case LX: \
140 if(autotune_ax_helm[LX] == 0 ) { \
141 char *env_value = NULL; \
142 char neko_log_buf[80]; \
143 env_value = getenv("NEKO_AUTOTUNE"); \
144 \
145 sprintf(neko_log_buf, "Autotune Ax helm (lx: %d)", *lx); \
146 log_section(neko_log_buf); \
147 if(env_value) { \
148 if( !strcmp(env_value,"1D") ) { \
149 CASE_1D(LX, glb_cmd_queue, NULL); \
150 sprintf(neko_log_buf,"Set by env : 1 (1D)"); \
151 log_message(neko_log_buf); \
152 autotune_ax_helm[LX] = 1; \
153 } else if( !strcmp(env_value,"KSTEP") ) { \
154 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
155 sprintf(neko_log_buf,"Set by env : 2 (KSTEP)"); \
156 log_message(neko_log_buf); \
157 autotune_ax_helm[LX] = 2; \
158 } else { \
159 sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE"); \
160 log_error(neko_log_buf); \
161 } \
162 } \
163 else { \
164 CL_CHECK(clFinish(glb_cmd_queue)); \
165 cl_event perf_event, sync_event; \
166 cl_ulong start, end; \
167 CL_CHECK(clEnqueueMarker(glb_cmd_queue, &sync_event)); \
168 CL_CHECK(clEnqueueBarrier(prf_cmd_queue)); \
169 CL_CHECK(clEnqueueWaitForEvents(prf_cmd_queue, 1, &sync_event)); \
170 \
171 double elapsed1 = 0.0; \
172 for(int i = 0; i < 100; i++) { \
173 CASE_1D(LX, prf_cmd_queue, &perf_event); \
174 CL_CHECK(clWaitForEvents(1, &perf_event)); \
175 CL_CHECK(clGetEventProfilingInfo(perf_event, \
176 CL_PROFILING_COMMAND_START, \
177 sizeof(cl_ulong), &start, NULL)); \
178 CL_CHECK(clGetEventProfilingInfo(perf_event, \
179 CL_PROFILING_COMMAND_END, \
180 sizeof(cl_ulong), &end, NULL)); \
181 elapsed1 += (end - start)*1.0e-6; \
182 } \
183 \
184 double elapsed2 = 0.0; \
185 for(int i = 0; i < 100; i++) { \
186 CASE_KSTEP(LX, prf_cmd_queue, &perf_event); \
187 CL_CHECK(clWaitForEvents(1, &perf_event)); \
188 CL_CHECK(clGetEventProfilingInfo(perf_event, \
189 CL_PROFILING_COMMAND_START, \
190 sizeof(cl_ulong), &start, NULL)); \
191 CL_CHECK(clGetEventProfilingInfo(perf_event, \
192 CL_PROFILING_COMMAND_END, \
193 sizeof(cl_ulong), &end, NULL)); \
194 elapsed2 += (end - start)*1.0e-6; \
195 } \
196 \
197 CL_CHECK(clFinish(prf_cmd_queue)); \
198 CL_CHECK(clEnqueueMarker(prf_cmd_queue, &sync_event)); \
199 int krnl_strtgy = (elapsed1 < elapsed2 ? 1 : 2); \
200 sprintf(neko_log_buf, "Chose : %d (%s)", krnl_strtgy, \
201 (krnl_strtgy > 1 ? "KSTEP" : "1D")); \
202 autotune_ax_helm[LX] = krnl_strtgy; \
203 log_message(neko_log_buf); \
204 clEnqueueBarrier(glb_cmd_queue); \
205 clEnqueueWaitForEvents(glb_cmd_queue, 1, &sync_event) ; \
206 } \
207 log_end_section(); \
208 } else if (autotune_ax_helm[LX] == 1 ) { \
209 CASE_1D(LX, glb_cmd_queue, NULL); \
210 } else if (autotune_ax_helm[LX] == 2 ) { \
211 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
212 } \
213 break
214
215#define CASE_LARGE(LX) \
216 case LX: \
217 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
218 break
219
220 if((*lx) < 13) {
221 switch(*lx) {
222 CASE(2);
223 CASE(3);
224 CASE(4);
225 CASE(5);
226 CASE(6);
227 CASE(7);
228 CASE(8);
229 CASE(9);
230 CASE(10);
231 CASE(11);
232 CASE(12);
233 default:
234 {
235 fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
236 exit(1);
237 }
238 }
239 }
240 else {
241 switch(*lx) {
242 CASE_LARGE(13);
243 CASE_LARGE(14);
244 CASE_LARGE(15);
245 CASE_LARGE(16);
246 default:
247 {
248 fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
249 exit(1);
250 }
251 }
252 }
253}
254
258void opencl_ax_helm_vector(void *au, void *av, void *aw,
259 void *u, void *v, void *w,
260 void *dx, void *dy, void *dz,
261 void *dxt, void *dyt, void *dzt, void *h1,
262 void *g11, void *g22, void *g33, void *g12,
263 void *g13, void *g23, int *nelv, int *lx) {
264
265 cl_int err;
266
267 if (ax_helm_program == NULL)
269
270 size_t * global_kstep = (size_t *) malloc(sizeof(size_t) * 2);
271 size_t * local_kstep = (size_t *) malloc(sizeof(size_t) * 2);
272 local_kstep[0] = (*lx);
273 local_kstep[1] = (*lx);
274 global_kstep[0] = (*nelv) * (*lx);
275 global_kstep[1] = (*lx);
276
277
278#define STR(X) #X
279#define CASE_VECTOR(LX) \
280 case LX: \
281 { \
282 cl_kernel kernel = \
283 clCreateKernel(ax_helm_program, \
284 STR(ax_helm_kernel_vector_kstep_lx##LX), &err); \
285 CL_CHECK(err); \
286 \
287 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &au)); \
288 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &av)); \
289 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &aw)); \
290 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u)); \
291 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v)); \
292 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &w)); \
293 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dx)); \
294 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dy)); \
295 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &dz)); \
296 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &h1)); \
297 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &g11)); \
298 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &g22)); \
299 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &g33)); \
300 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &g12)); \
301 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &g13)); \
302 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_mem), (void *) &g23)); \
303 \
304 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, \
305 kernel, 2, NULL, global_kstep, \
306 local_kstep, 0, NULL, NULL)); \
307 \
308 } \
309 break
310
311 switch(*lx) {
312 CASE_VECTOR(2);
313 CASE_VECTOR(3);
314 CASE_VECTOR(4);
315 CASE_VECTOR(5);
316 CASE_VECTOR(6);
317 CASE_VECTOR(7);
318 CASE_VECTOR(8);
319 CASE_VECTOR(9);
320 CASE_VECTOR(10);
321 CASE_VECTOR(11);
322 CASE_VECTOR(12);
323 CASE_VECTOR(13);
324 CASE_VECTOR(14);
325 CASE_VECTOR(15);
326 CASE_VECTOR(16);
327 default:
328 {
329 fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
330 exit(1);
331 }
332 }
333
334 free(global_kstep);
335 free(local_kstep);
336}
void opencl_ax_helm_vector(void *au, void *av, void *aw, void *u, void *v, void *w, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition ax_helm.c:258
#define CASE_VECTOR(LX)
#define CASE(LX)
void opencl_ax_helm(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition ax_helm.c:57
int * autotune_ax_helm
Definition ax_helm.c:52
#define CASE_LARGE(LX)
__global__ void T *__restrict__ T *__restrict__ aw
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ av
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition jit.c:50
void * ax_helm_program