Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
opr_opgrad.c
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifdef __APPLE__
36#include <OpenCL/cl.h>
37#else
38#include <CL/cl.h>
39#endif
40
41#include <stdio.h>
42#include <stdlib.h>
43#include <string.h>
45#include <device/opencl/jit.h>
47#include <device/opencl/check.h>
48#include <common/neko_log.h>
49
50#include "opgrad_kernel.cl.h"
51
53
57void opencl_opgrad(void *ux, void *uy, void *uz, void *u,
58 void *dx, void *dy, void *dz,
59 void *drdx, void *dsdx, void *dtdx,
60 void *drdy, void *dsdy, void *dtdy,
61 void *drdz, void *dsdz, void *dtdz,
62 void *w3, int *nel, int *lx) {
63 cl_int err;
64
65 if (opgrad_program == NULL)
67
68 const size_t global_item_size = 256 * (*nel);
69 const size_t local_item_size = 256;
70
71 size_t global_kstep[2];
72 size_t local_kstep[2];
73 local_kstep[0] = (*lx);
74 local_kstep[1] = (*lx);
75 global_kstep[0] = (*nel) * (*lx);
76 global_kstep[1] = (*lx);
77
78 if (autotune_opgrad == NULL) {
79 autotune_opgrad = malloc(17 * sizeof(int));
80 memset(autotune_opgrad, 0, 17 * sizeof(int));
81 }
82
83#define STR(X) #X
84#define CASE_1D(LX, QUEUE, EVENT) \
85 { \
86 cl_kernel kernel = clCreateKernel(opgrad_program, \
87 STR(opgrad_kernel_lx##LX), &err); \
88 CL_CHECK(err); \
89 \
90 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &ux)); \
91 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &uy)); \
92 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &uz)); \
93 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u)); \
94 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dx)); \
95 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dy)); \
96 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dz)); \
97 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &drdx)); \
98 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &dsdx)); \
99 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &dtdx)); \
100 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &drdy)); \
101 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &dsdy)); \
102 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &dtdy)); \
103 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &drdz)); \
104 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &dsdz)); \
105 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_mem), (void *) &dtdz)); \
106 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_mem), (void *) &w3)); \
107 \
108 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
109 kernel, 1, NULL, &global_item_size, \
110 &local_item_size,0, NULL, EVENT)); \
111 }
112
113#define CASE_KSTEP(LX, QUEUE, EVENT) \
114 { \
115 cl_kernel kernel = clCreateKernel(opgrad_program, \
116 STR(opgrad_kernel_kstep_lx##LX), &err); \
117 CL_CHECK(err); \
118 \
119 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &ux)); \
120 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &uy)); \
121 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &uz)); \
122 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u)); \
123 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dx)); \
124 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dy)); \
125 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dz)); \
126 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &drdx)); \
127 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &dsdx)); \
128 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &dtdx)); \
129 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &drdy)); \
130 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &dsdy)); \
131 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &dtdy)); \
132 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &drdz)); \
133 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &dsdz)); \
134 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_mem), (void *) &dtdz)); \
135 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_mem), (void *) &w3)); \
136 \
137 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
138 kernel, 2, NULL, global_kstep, \
139 local_kstep, 0, NULL, EVENT)); \
140 }
141
142
143#define CASE(LX) \
144 case LX: \
145 if(autotune_opgrad[LX] == 0 ) { \
146 char *env_value = NULL; \
147 char neko_log_buf[80]; \
148 env_value = getenv("NEKO_AUTOTUNE"); \
149 \
150 sprintf(neko_log_buf, "Autotune opgrad (lx: %d)", *lx); \
151 log_section(neko_log_buf); \
152 if(env_value) { \
153 if( !strcmp(env_value,"1D") ) { \
154 CASE_1D(LX, glb_cmd_queue, NULL); \
155 sprintf(neko_log_buf,"Set by env : 1 (1D)"); \
156 log_message(neko_log_buf); \
157 autotune_opgrad[LX] = 1; \
158 } else if( !strcmp(env_value,"KSTEP") ) { \
159 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
160 sprintf(neko_log_buf,"Set by env : 2 (KSTEP)"); \
161 log_message(neko_log_buf); \
162 autotune_opgrad[LX] = 2; \
163 } else { \
164 sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE"); \
165 log_error(neko_log_buf); \
166 } \
167 } \
168 else { \
169 CL_CHECK(clFinish(glb_cmd_queue)); \
170 cl_event perf_event, sync_event; \
171 cl_ulong start, end; \
172 CL_CHECK(clEnqueueMarker(glb_cmd_queue, &sync_event)); \
173 CL_CHECK(clEnqueueBarrier(prf_cmd_queue)); \
174 CL_CHECK(clEnqueueWaitForEvents(prf_cmd_queue, 1, &sync_event)); \
175 \
176 double elapsed1 = 0.0; \
177 for(int i = 0; i < 100; i++) { \
178 CASE_1D(LX, prf_cmd_queue, &perf_event); \
179 CL_CHECK(clWaitForEvents(1, &perf_event)); \
180 CL_CHECK(clGetEventProfilingInfo(perf_event, \
181 CL_PROFILING_COMMAND_START, \
182 sizeof(cl_ulong), &start, NULL)); \
183 CL_CHECK(clGetEventProfilingInfo(perf_event, \
184 CL_PROFILING_COMMAND_END, \
185 sizeof(cl_ulong), &end, NULL)); \
186 elapsed1 += (end - start)*1.0e-6; \
187 } \
188 \
189 double elapsed2 = 0.0; \
190 for(int i = 0; i < 100; i++) { \
191 CASE_KSTEP(LX, prf_cmd_queue, &perf_event); \
192 CL_CHECK(clWaitForEvents(1, &perf_event)); \
193 CL_CHECK(clGetEventProfilingInfo(perf_event, \
194 CL_PROFILING_COMMAND_START, \
195 sizeof(cl_ulong), &start, NULL)); \
196 CL_CHECK(clGetEventProfilingInfo(perf_event, \
197 CL_PROFILING_COMMAND_END, \
198 sizeof(cl_ulong), &end, NULL)); \
199 elapsed2 += (end - start)*1.0e-6; \
200 } \
201 \
202 CL_CHECK(clFinish(prf_cmd_queue)); \
203 CL_CHECK(clEnqueueMarker(prf_cmd_queue, &sync_event)); \
204 int krnl_strtgy = (elapsed1 < elapsed2 ? 1 : 2); \
205 sprintf(neko_log_buf, "Chose : %d (%s)", krnl_strtgy, \
206 (krnl_strtgy > 1 ? "KSTEP" : "1D")); \
207 autotune_opgrad[LX] = krnl_strtgy; \
208 log_message(neko_log_buf); \
209 clEnqueueBarrier(glb_cmd_queue); \
210 clEnqueueWaitForEvents(glb_cmd_queue, 1, &sync_event) ; \
211 } \
212 log_end_section(); \
213 } else if (autotune_opgrad[LX] == 1 ) { \
214 CASE_1D(LX, glb_cmd_queue, NULL); \
215 } else if (autotune_opgrad[LX] == 2 ) { \
216 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
217 } \
218 break
219
220 switch(*lx) {
221 CASE(2);
222 CASE(3);
223 CASE(4);
224 CASE(5);
225 CASE(6);
226 CASE(7);
227 CASE(8);
228 CASE(9);
229 CASE(10);
230 CASE(11);
231 CASE(12);
232 CASE(13);
233 CASE(14);
234 CASE(15);
235 CASE(16);
236 default:
237 {
238 fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
239 exit(1);
240 }
241 }
242}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void T *__restrict__ uy
__global__ void T *__restrict__ T *__restrict__ uz
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition jit.c:50
int * autotune_opgrad
Definition opr_opgrad.c:52
#define CASE(LX)
void opencl_opgrad(void *ux, void *uy, void *uz, void *u, void *dx, void *dy, void *dz, void *drdx, void *dsdx, void *dtdx, void *drdy, void *dsdy, void *dtdy, void *drdz, void *dsdz, void *dtdz, void *w3, int *nel, int *lx)
Definition opr_opgrad.c:57
void * opgrad_program