Neko 1.99.2
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
opr_opgrad.c
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifdef __APPLE__
36#include <OpenCL/cl.h>
37#else
38#include <CL/cl.h>
39#endif
40
41#include <stdio.h>
42#include <stdlib.h>
43#include <string.h>
45#include <device/opencl/jit.h>
47#include <device/opencl/check.h>
48#include <common/neko_log.h>
49
50#include "opgrad_kernel.cl.h"
51
53
57void opencl_opgrad(void *ux, void *uy, void *uz, void *u,
58 void *dx, void *dy, void *dz,
59 void *drdx, void *dsdx, void *dtdx,
60 void *drdy, void *dsdy, void *dtdy,
61 void *drdz, void *dsdz, void *dtdz,
62 void *w3, int *nel, int *lx) {
63 cl_int err;
64
65 if (opgrad_program == NULL)
67
68 const size_t global_item_size = 256 * (*nel);
69 const size_t local_item_size = 256;
70
71 size_t global_kstep[2];
72 size_t local_kstep[2];
73 local_kstep[0] = (*lx);
74 local_kstep[1] = (*lx);
75 global_kstep[0] = (*nel) * (*lx);
76 global_kstep[1] = (*lx);
77
78 if (autotune_opgrad == NULL) {
79 autotune_opgrad = malloc(17 * sizeof(int));
80 memset(autotune_opgrad, 0, 17 * sizeof(int));
81 }
82
83#define STR(X) #X
84#define CASE_1D(LX, QUEUE, EVENT) \
85 { \
86 cl_kernel kernel = clCreateKernel(opgrad_program, \
87 STR(opgrad_kernel_lx##LX), &err); \
88 CL_CHECK(err); \
89 \
90 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &ux)); \
91 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &uy)); \
92 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &uz)); \
93 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u)); \
94 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dx)); \
95 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dy)); \
96 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dz)); \
97 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &drdx)); \
98 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &dsdx)); \
99 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &dtdx)); \
100 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &drdy)); \
101 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &dsdy)); \
102 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &dtdy)); \
103 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &drdz)); \
104 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &dsdz)); \
105 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_mem), (void *) &dtdz)); \
106 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_mem), (void *) &w3)); \
107 \
108 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
109 kernel, 1, NULL, &global_item_size, \
110 &local_item_size,0, NULL, EVENT)); \
111 CL_CHECK(clReleaseKernel(kernel)); \
112 }
113
114#define CASE_KSTEP(LX, QUEUE, EVENT) \
115 { \
116 cl_kernel kernel = clCreateKernel(opgrad_program, \
117 STR(opgrad_kernel_kstep_lx##LX), &err); \
118 CL_CHECK(err); \
119 \
120 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &ux)); \
121 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &uy)); \
122 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &uz)); \
123 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u)); \
124 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dx)); \
125 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dy)); \
126 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dz)); \
127 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &drdx)); \
128 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &dsdx)); \
129 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &dtdx)); \
130 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *) &drdy)); \
131 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *) &dsdy)); \
132 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *) &dtdy)); \
133 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *) &drdz)); \
134 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_mem), (void *) &dsdz)); \
135 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_mem), (void *) &dtdz)); \
136 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_mem), (void *) &w3)); \
137 \
138 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
139 kernel, 2, NULL, global_kstep, \
140 local_kstep, 0, NULL, EVENT)); \
141 CL_CHECK(clReleaseKernel(kernel)); \
142 }
143
144
145#define CASE(LX) \
146 case LX: \
147 if(autotune_opgrad[LX] == 0 ) { \
148 char *env_value = NULL; \
149 char neko_log_buf[80]; \
150 env_value = getenv("NEKO_AUTOTUNE"); \
151 \
152 sprintf(neko_log_buf, "Autotune opgrad (lx: %d)", *lx); \
153 log_section(neko_log_buf); \
154 if(env_value) { \
155 if( !strcmp(env_value,"1D") ) { \
156 CASE_1D(LX, glb_cmd_queue, NULL); \
157 sprintf(neko_log_buf,"Set by env : 1 (1D)"); \
158 log_message(neko_log_buf); \
159 autotune_opgrad[LX] = 1; \
160 } else if( !strcmp(env_value,"KSTEP") ) { \
161 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
162 sprintf(neko_log_buf,"Set by env : 2 (KSTEP)"); \
163 log_message(neko_log_buf); \
164 autotune_opgrad[LX] = 2; \
165 } else { \
166 sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE"); \
167 log_error(neko_log_buf); \
168 } \
169 } \
170 else { \
171 CL_CHECK(clFinish(glb_cmd_queue)); \
172 cl_event perf_event, sync_event; \
173 cl_ulong start, end; \
174 CL_CHECK(clEnqueueMarker(glb_cmd_queue, &sync_event)); \
175 CL_CHECK(clEnqueueBarrier(prf_cmd_queue)); \
176 CL_CHECK(clEnqueueWaitForEvents(prf_cmd_queue, 1, &sync_event)); \
177 \
178 double elapsed1 = 0.0; \
179 for(int i = 0; i < 100; i++) { \
180 CASE_1D(LX, prf_cmd_queue, &perf_event); \
181 CL_CHECK(clWaitForEvents(1, &perf_event)); \
182 CL_CHECK(clGetEventProfilingInfo(perf_event, \
183 CL_PROFILING_COMMAND_START, \
184 sizeof(cl_ulong), &start, NULL)); \
185 CL_CHECK(clGetEventProfilingInfo(perf_event, \
186 CL_PROFILING_COMMAND_END, \
187 sizeof(cl_ulong), &end, NULL)); \
188 elapsed1 += (end - start)*1.0e-6; \
189 } \
190 \
191 double elapsed2 = 0.0; \
192 for(int i = 0; i < 100; i++) { \
193 CASE_KSTEP(LX, prf_cmd_queue, &perf_event); \
194 CL_CHECK(clWaitForEvents(1, &perf_event)); \
195 CL_CHECK(clGetEventProfilingInfo(perf_event, \
196 CL_PROFILING_COMMAND_START, \
197 sizeof(cl_ulong), &start, NULL)); \
198 CL_CHECK(clGetEventProfilingInfo(perf_event, \
199 CL_PROFILING_COMMAND_END, \
200 sizeof(cl_ulong), &end, NULL)); \
201 elapsed2 += (end - start)*1.0e-6; \
202 } \
203 \
204 CL_CHECK(clFinish(prf_cmd_queue)); \
205 CL_CHECK(clEnqueueMarker(prf_cmd_queue, &sync_event)); \
206 int krnl_strtgy = (elapsed1 < elapsed2 ? 1 : 2); \
207 sprintf(neko_log_buf, "Chose : %d (%s)", krnl_strtgy, \
208 (krnl_strtgy > 1 ? "KSTEP" : "1D")); \
209 autotune_opgrad[LX] = krnl_strtgy; \
210 log_message(neko_log_buf); \
211 clEnqueueBarrier(glb_cmd_queue); \
212 clEnqueueWaitForEvents(glb_cmd_queue, 1, &sync_event) ; \
213 } \
214 log_end_section(); \
215 } else if (autotune_opgrad[LX] == 1 ) { \
216 CASE_1D(LX, glb_cmd_queue, NULL); \
217 } else if (autotune_opgrad[LX] == 2 ) { \
218 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
219 } \
220 break
221
222 switch(*lx) {
223 CASE(2);
224 CASE(3);
225 CASE(4);
226 CASE(5);
227 CASE(6);
228 CASE(7);
229 CASE(8);
230 CASE(9);
231 CASE(10);
232 CASE(11);
233 CASE(12);
234 CASE(13);
235 CASE(14);
236 CASE(15);
237 CASE(16);
238 default:
239 {
240 fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
241 exit(1);
242 }
243 }
244}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void T *__restrict__ uy
__global__ void T *__restrict__ T *__restrict__ uz
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition jit.c:50
int * autotune_opgrad
Definition opr_opgrad.c:52
#define CASE(LX)
void opencl_opgrad(void *ux, void *uy, void *uz, void *u, void *dx, void *dy, void *dz, void *drdx, void *dsdx, void *dtdx, void *drdy, void *dsdy, void *dtdy, void *drdz, void *dsdz, void *dtdz, void *w3, int *nel, int *lx)
Definition opr_opgrad.c:57
void * opgrad_program