Neko 1.99.2
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
opr_cdtp.c
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifdef __APPLE__
36#include <OpenCL/cl.h>
37#else
38#include <CL/cl.h>
39#endif
40
41#include <stdlib.h>
42#include <stdio.h>
43#include <string.h>
45#include <device/opencl/jit.h>
47#include <device/opencl/check.h>
48#include <common/neko_log.h>
49
50#include "cdtp_kernel.cl.h"
51
53
57void opencl_cdtp(void *dtx, void *x,
58 void *dr, void *ds, void *dt,
59 void *dxt, void *dyt, void *dzt,
60 void *w3, int *nel, int *lx) {
61 cl_int err;
62
63 if (cdtp_program == NULL)
65
66 const size_t global_item_size = 256 * (*nel);
67 const size_t local_item_size = 256;
68
69 size_t global_kstep[2];
70 size_t local_kstep[2];
71 local_kstep[0] = (*lx);
72 local_kstep[1] = (*lx);
73 global_kstep[0] = (*nel) * (*lx);
74 global_kstep[1] = (*lx);
75
76 if (autotune_cdtp == NULL) {
77 autotune_cdtp = malloc(17 * sizeof(int));
78 memset(autotune_cdtp, 0, 17 * sizeof(int));
79 }
80
81#define STR(X) #X
82#define CASE_1D(LX, QUEUE, EVENT) \
83 { \
84 cl_kernel kernel = clCreateKernel(cdtp_program, \
85 STR(cdtp_kernel_lx##LX), &err); \
86 CL_CHECK(err); \
87 \
88 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dtx)); \
89 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &x)); \
90 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dr)); \
91 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &ds)); \
92 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dt)); \
93 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dxt)); \
94 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dyt)); \
95 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dzt)); \
96 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3)); \
97 \
98 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
99 kernel, 1, NULL, &global_item_size, \
100 &local_item_size, 0, NULL, EVENT)); \
101 CL_CHECK(clReleaseKernel(kernel)); \
102 }
103
104
105#define CASE_KSTEP(LX, QUEUE, EVENT) \
106 { \
107 cl_kernel kernel = clCreateKernel(cdtp_program, \
108 STR(cdtp_kernel_kstep_lx##LX), &err); \
109 CL_CHECK(err); \
110 \
111 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dtx)); \
112 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &x)); \
113 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dr)); \
114 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &ds)); \
115 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dt)); \
116 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dxt)); \
117 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dyt)); \
118 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dzt)); \
119 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3)); \
120 \
121 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
122 kernel, 2, NULL, global_kstep, \
123 local_kstep, 0, NULL, EVENT)); \
124 CL_CHECK(clReleaseKernel(kernel)); \
125 }
126
127
128#define CASE(LX) \
129 case LX: \
130 if(autotune_cdtp[LX] == 0 ) { \
131 char *env_value = NULL; \
132 char neko_log_buf[80]; \
133 env_value = getenv("NEKO_AUTOTUNE"); \
134 \
135 sprintf(neko_log_buf, "Autotune cdtp (lx: %d)", *lx); \
136 log_section(neko_log_buf); \
137 if(env_value) { \
138 if( !strcmp(env_value,"1D") ) { \
139 CASE_1D(LX, glb_cmd_queue, NULL); \
140 sprintf(neko_log_buf,"Set by env : 1 (1D)"); \
141 log_message(neko_log_buf); \
142 autotune_cdtp[LX] = 1; \
143 } else if( !strcmp(env_value,"KSTEP") ) { \
144 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
145 sprintf(neko_log_buf,"Set by env : 2 (KSTEP)"); \
146 log_message(neko_log_buf); \
147 autotune_cdtp[LX] = 2; \
148 } else { \
149 sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE"); \
150 log_error(neko_log_buf); \
151 } \
152 } \
153 else { \
154 CL_CHECK(clFinish(glb_cmd_queue)); \
155 cl_event perf_event, sync_event; \
156 cl_ulong start, end; \
157 CL_CHECK(clEnqueueMarker(glb_cmd_queue, &sync_event)); \
158 CL_CHECK(clEnqueueBarrier(prf_cmd_queue)); \
159 CL_CHECK(clEnqueueWaitForEvents(prf_cmd_queue, 1, &sync_event)); \
160 \
161 double elapsed1 = 0.0; \
162 for(int i = 0; i < 100; i++) { \
163 CASE_1D(LX, prf_cmd_queue, &perf_event); \
164 CL_CHECK(clWaitForEvents(1, &perf_event)); \
165 CL_CHECK(clGetEventProfilingInfo(perf_event, \
166 CL_PROFILING_COMMAND_START, \
167 sizeof(cl_ulong), &start, NULL)); \
168 CL_CHECK(clGetEventProfilingInfo(perf_event, \
169 CL_PROFILING_COMMAND_END, \
170 sizeof(cl_ulong), &end, NULL)); \
171 elapsed1 += (end - start)*1.0e-6; \
172 } \
173 \
174 double elapsed2 = 0.0; \
175 for(int i = 0; i < 100; i++) { \
176 CASE_KSTEP(LX, prf_cmd_queue, &perf_event); \
177 CL_CHECK(clWaitForEvents(1, &perf_event)); \
178 CL_CHECK(clGetEventProfilingInfo(perf_event, \
179 CL_PROFILING_COMMAND_START, \
180 sizeof(cl_ulong), &start, NULL)); \
181 CL_CHECK(clGetEventProfilingInfo(perf_event, \
182 CL_PROFILING_COMMAND_END, \
183 sizeof(cl_ulong), &end, NULL)); \
184 elapsed2 += (end - start)*1.0e-6; \
185 } \
186 \
187 CL_CHECK(clFinish(prf_cmd_queue)); \
188 CL_CHECK(clEnqueueMarker(prf_cmd_queue, &sync_event)); \
189 int krnl_strtgy = (elapsed1 < elapsed2 ? 1 : 2); \
190 sprintf(neko_log_buf, "Chose : %d (%s)", krnl_strtgy, \
191 (krnl_strtgy > 1 ? "KSTEP" : "1D")); \
192 autotune_cdtp[LX] = krnl_strtgy; \
193 log_message(neko_log_buf); \
194 clEnqueueBarrier(glb_cmd_queue); \
195 clEnqueueWaitForEvents(glb_cmd_queue, 1, &sync_event) ; \
196 } \
197 log_end_section(); \
198 } else if (autotune_cdtp[LX] == 1 ) { \
199 CASE_1D(LX, glb_cmd_queue, NULL); \
200 } else if (autotune_cdtp[LX] == 2 ) { \
201 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
202 } \
203 break
204
205#define CASE_LARGE(LX) \
206 case LX: \
207 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
208 break
209
210 if ((*lx) < 14) {
211 switch(*lx) {
212 CASE(2);
213 CASE(3);
214 CASE(4);
215 CASE(5);
216 CASE(6);
217 CASE(7);
218 CASE(8);
219 CASE(9);
220 CASE(10);
221 CASE(11);
222 CASE(12);
223 CASE(13);
224 default:
225 {
226 fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
227 exit(1);
228 }
229 }
230 }
231 else {
232 switch(*lx) {
233 CASE_LARGE(14);
234 CASE_LARGE(15);
235 CASE_LARGE(16);
236 default:
237 {
238 fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
239 exit(1);
240 }
241 }
242 }
243}
__global__ void const T *__restrict__ const T *__restrict__ dr
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition jit.c:50
void opencl_cdtp(void *dtx, void *x, void *dr, void *ds, void *dt, void *dxt, void *dyt, void *dzt, void *w3, int *nel, int *lx)
Definition opr_cdtp.c:57
#define CASE(LX)
int * autotune_cdtp
Definition opr_cdtp.c:52
#define CASE_LARGE(LX)
void * cdtp_program