Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
opr_cdtp.c
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifdef __APPLE__
36#include <OpenCL/cl.h>
37#else
38#include <CL/cl.h>
39#endif
40
41#include <stdlib.h>
42#include <stdio.h>
43#include <string.h>
45#include <device/opencl/jit.h>
47#include <device/opencl/check.h>
48#include <common/neko_log.h>
49
50#include "cdtp_kernel.cl.h"
51
53
57void opencl_cdtp(void *dtx, void *x,
58 void *dr, void *ds, void *dt,
59 void *dxt, void *dyt, void *dzt,
60 void *w3, int *nel, int *lx) {
61 cl_int err;
62
63 if (cdtp_program == NULL)
65
66 const size_t global_item_size = 256 * (*nel);
67 const size_t local_item_size = 256;
68
69 size_t global_kstep[2];
70 size_t local_kstep[2];
71 local_kstep[0] = (*lx);
72 local_kstep[1] = (*lx);
73 global_kstep[0] = (*nel) * (*lx);
74 global_kstep[1] = (*lx);
75
76 if (autotune_cdtp == NULL) {
77 autotune_cdtp = malloc(17 * sizeof(int));
78 memset(autotune_cdtp, 0, 17 * sizeof(int));
79 }
80
81#define STR(X) #X
82#define CASE_1D(LX, QUEUE, EVENT) \
83 { \
84 cl_kernel kernel = clCreateKernel(cdtp_program, \
85 STR(cdtp_kernel_lx##LX), &err); \
86 CL_CHECK(err); \
87 \
88 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dtx)); \
89 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &x)); \
90 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dr)); \
91 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &ds)); \
92 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dt)); \
93 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dxt)); \
94 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dyt)); \
95 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dzt)); \
96 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3)); \
97 \
98 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
99 kernel, 1, NULL, &global_item_size, \
100 &local_item_size, 0, NULL, EVENT)); \
101 }
102
103
104#define CASE_KSTEP(LX, QUEUE, EVENT) \
105 { \
106 cl_kernel kernel = clCreateKernel(cdtp_program, \
107 STR(cdtp_kernel_kstep_lx##LX), &err); \
108 CL_CHECK(err); \
109 \
110 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dtx)); \
111 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &x)); \
112 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &dr)); \
113 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &ds)); \
114 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &dt)); \
115 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &dxt)); \
116 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &dyt)); \
117 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &dzt)); \
118 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3)); \
119 \
120 CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) QUEUE, \
121 kernel, 2, NULL, global_kstep, \
122 local_kstep, 0, NULL, EVENT)); \
123 }
124
125
126#define CASE(LX) \
127 case LX: \
128 if(autotune_cdtp[LX] == 0 ) { \
129 char *env_value = NULL; \
130 char neko_log_buf[80]; \
131 env_value = getenv("NEKO_AUTOTUNE"); \
132 \
133 sprintf(neko_log_buf, "Autotune cdtp (lx: %d)", *lx); \
134 log_section(neko_log_buf); \
135 if(env_value) { \
136 if( !strcmp(env_value,"1D") ) { \
137 CASE_1D(LX, glb_cmd_queue, NULL); \
138 sprintf(neko_log_buf,"Set by env : 1 (1D)"); \
139 log_message(neko_log_buf); \
140 autotune_cdtp[LX] = 1; \
141 } else if( !strcmp(env_value,"KSTEP") ) { \
142 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
143 sprintf(neko_log_buf,"Set by env : 2 (KSTEP)"); \
144 log_message(neko_log_buf); \
145 autotune_cdtp[LX] = 2; \
146 } else { \
147 sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE"); \
148 log_error(neko_log_buf); \
149 } \
150 } \
151 else { \
152 CL_CHECK(clFinish(glb_cmd_queue)); \
153 cl_event perf_event, sync_event; \
154 cl_ulong start, end; \
155 CL_CHECK(clEnqueueMarker(glb_cmd_queue, &sync_event)); \
156 CL_CHECK(clEnqueueBarrier(prf_cmd_queue)); \
157 CL_CHECK(clEnqueueWaitForEvents(prf_cmd_queue, 1, &sync_event)); \
158 \
159 double elapsed1 = 0.0; \
160 for(int i = 0; i < 100; i++) { \
161 CASE_1D(LX, prf_cmd_queue, &perf_event); \
162 CL_CHECK(clWaitForEvents(1, &perf_event)); \
163 CL_CHECK(clGetEventProfilingInfo(perf_event, \
164 CL_PROFILING_COMMAND_START, \
165 sizeof(cl_ulong), &start, NULL)); \
166 CL_CHECK(clGetEventProfilingInfo(perf_event, \
167 CL_PROFILING_COMMAND_END, \
168 sizeof(cl_ulong), &end, NULL)); \
169 elapsed1 += (end - start)*1.0e-6; \
170 } \
171 \
172 double elapsed2 = 0.0; \
173 for(int i = 0; i < 100; i++) { \
174 CASE_KSTEP(LX, prf_cmd_queue, &perf_event); \
175 CL_CHECK(clWaitForEvents(1, &perf_event)); \
176 CL_CHECK(clGetEventProfilingInfo(perf_event, \
177 CL_PROFILING_COMMAND_START, \
178 sizeof(cl_ulong), &start, NULL)); \
179 CL_CHECK(clGetEventProfilingInfo(perf_event, \
180 CL_PROFILING_COMMAND_END, \
181 sizeof(cl_ulong), &end, NULL)); \
182 elapsed2 += (end - start)*1.0e-6; \
183 } \
184 \
185 CL_CHECK(clFinish(prf_cmd_queue)); \
186 CL_CHECK(clEnqueueMarker(prf_cmd_queue, &sync_event)); \
187 int krnl_strtgy = (elapsed1 < elapsed2 ? 1 : 2); \
188 sprintf(neko_log_buf, "Chose : %d (%s)", krnl_strtgy, \
189 (krnl_strtgy > 1 ? "KSTEP" : "1D")); \
190 autotune_cdtp[LX] = krnl_strtgy; \
191 log_message(neko_log_buf); \
192 clEnqueueBarrier(glb_cmd_queue); \
193 clEnqueueWaitForEvents(glb_cmd_queue, 1, &sync_event) ; \
194 } \
195 log_end_section(); \
196 } else if (autotune_cdtp[LX] == 1 ) { \
197 CASE_1D(LX, glb_cmd_queue, NULL); \
198 } else if (autotune_cdtp[LX] == 2 ) { \
199 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
200 } \
201 break
202
203#define CASE_LARGE(LX) \
204 case LX: \
205 CASE_KSTEP(LX, glb_cmd_queue, NULL); \
206 break
207
208 if ((*lx) < 14) {
209 switch(*lx) {
210 CASE(2);
211 CASE(3);
212 CASE(4);
213 CASE(5);
214 CASE(6);
215 CASE(7);
216 CASE(8);
217 CASE(9);
218 CASE(10);
219 CASE(11);
220 CASE(12);
221 CASE(13);
222 default:
223 {
224 fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
225 exit(1);
226 }
227 }
228 }
229 else {
230 switch(*lx) {
231 CASE_LARGE(14);
232 CASE_LARGE(15);
233 CASE_LARGE(16);
234 default:
235 {
236 fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
237 exit(1);
238 }
239 }
240 }
241}
__global__ void const T *__restrict__ const T *__restrict__ dr
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition jit.c:50
void opencl_cdtp(void *dtx, void *x, void *dr, void *ds, void *dt, void *dxt, void *dyt, void *dzt, void *w3, int *nel, int *lx)
Definition opr_cdtp.c:57
#define CASE(LX)
int * autotune_cdtp
Definition opr_cdtp.c:52
#define CASE_LARGE(LX)
void * cdtp_program