Neko  0.8.1
A portable framework for high-order spectral element flow simulations
ax_helm.cu
Go to the documentation of this file.
1 /*
2  Copyright (c) 2021-2023, The Neko Authors
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above
13  copyright notice, this list of conditions and the following
14  disclaimer in the documentation and/or other materials provided
15  with the distribution.
16 
17  * Neither the name of the authors nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  POSSIBILITY OF SUCH DAMAGE.
33 */
34 
35 #include <string.h>
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include "ax_helm_kernel.h"
39 #include <device/device_config.h>
40 #include <device/cuda/check.h>
41 
42 extern "C" {
43  #include <common/neko_log.h>
44 }
45 
46 template < const int>
47 int tune(void *w, void *u, void *dx, void *dy, void *dz,
48  void *dxt, void *dyt, void *dzt, void *h1,
49  void *g11, void *g22, void *g33, void *g12,
50  void *g13, void *g23, int *nelv, int *lx);
51 
52 template < const int>
53 int tune_padded(void *w, void *u, void *dx, void *dy, void *dz,
54  void *dxt, void *dyt, void *dzt, void *h1,
55  void *g11, void *g22, void *g33, void *g12,
56  void *g13, void *g23, int *nelv, int *lx);
57 
58 extern "C" {
59 
63  void cuda_ax_helm(void *w, void *u, void *dx, void *dy, void *dz,
64  void *dxt, void *dyt, void *dzt, void *h1,
65  void *g11, void *g22, void *g33, void *g12,
66  void *g13, void *g23, int *nelv, int *lx) {
67 
68  static int autotune[17] = { 0 };
69 
70  const dim3 nthrds_1d(1024, 1, 1);
71  const dim3 nblcks_1d((*nelv), 1, 1);
72  const dim3 nthrds_kstep((*lx), (*lx), 1);
73  const dim3 nblcks_kstep((*nelv), 1, 1);
74 
75  const dim3 nthrds((*lx), (*lx), 1);
76  const dim3 nblcks((*nelv), 1, 1);
77  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
78 
79 #define CASE_1D(LX) \
80  ax_helm_kernel_1d<real, LX, 1024> \
81  <<<nblcks_1d, nthrds_1d, 0, stream>>>((real *) w, (real *) u, \
82  (real *) dx, (real *) dy, (real *) dz, \
83  (real *) dxt, (real *) dyt, (real *) dzt, (real *) h1,\
84  (real *) g11, (real *) g22, (real *) g33, \
85  (real *) g12, (real *) g13, (real *) g23); \
86  CUDA_CHECK(cudaGetLastError());
87 
88 #define CASE_KSTEP(LX) \
89  ax_helm_kernel_kstep<real, LX> \
90  <<<nblcks_kstep, nthrds_kstep, 0, stream>>>((real *) w, (real *) u, \
91  (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
92  (real *) g11, (real *) g22, (real *) g33, \
93  (real *) g12, (real *) g13, (real *) g23); \
94  CUDA_CHECK(cudaGetLastError());
95 
96 #define CASE_KSTEP_PADDED(LX) \
97  ax_helm_kernel_kstep_padded<real, LX> \
98  <<<nblcks_kstep, nthrds_kstep, 0, stream>>>((real *) w, (real *) u, \
99  (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
100  (real *) g11, (real *) g22, (real *) g33, \
101  (real *) g12, (real *) g13, (real *) g23); \
102  CUDA_CHECK(cudaGetLastError());
103 
104 #define CASE(LX) \
105  case LX: \
106  if(autotune[LX] == 0 ) { \
107  autotune[LX]=tune<LX>( w, u, \
108  dx, dy, dz, \
109  dxt, dyt, dzt,h1, \
110  g11, g22, g33, \
111  g12, g13, g23, nelv, lx); \
112  } else if (autotune[LX] == 1 ) { \
113  CASE_1D(LX); \
114  } else if (autotune[LX] == 2 ) { \
115  CASE_KSTEP(LX); \
116  } \
117  break
118 
119 
120 #define CASE_PADDED(LX) \
121  case LX: \
122  if(autotune[LX] == 0 ) { \
123  autotune[LX]=tune_padded<LX>(w, u, \
124  dx, dy, dz, \
125  dxt, dyt, dzt,h1, \
126  g11, g22, g33, \
127  g12, g13, g23,nelv,lx); \
128  } else if (autotune[LX] == 1 ) { \
129  CASE_1D(LX); \
130  } else if (autotune[LX] == 2 ) { \
131  CASE_KSTEP_PADDED(LX); \
132  } \
133  break
134 
135 #define CASE_LARGE(LX) \
136  case LX: \
137  CASE_KSTEP(LX); \
138  break
139 
140 #define CASE_LARGE_PADDED(LX) \
141  case LX: \
142  CASE_KSTEP_PADDED(LX); \
143  break
144 
145 
146  if ((*lx) < 12) {
147  switch(*lx) {
148  CASE(2);
149  CASE(3);
150  CASE_PADDED(4);
151  CASE(5);
152  CASE(6);
153  CASE(7);
154  CASE_PADDED(8);
155  CASE(9);
156  CASE(10);
157  CASE(11);
158  default:
159  {
160  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
161  exit(1);
162  }
163  }
164  }
165  else {
166  switch(*lx) {
167  CASE_LARGE(12);
168  CASE_LARGE(13);
169  CASE_LARGE(14);
170  CASE_LARGE(15);
171  CASE_LARGE_PADDED(16);
172  default:
173  {
174  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
175  exit(1);
176  }
177  }
178  }
179  }
180 }
181 
182 template < const int LX >
183 int tune(void *w, void *u, void *dx, void *dy, void *dz,
184  void *dxt, void *dyt, void *dzt, void *h1,
185  void *g11, void *g22, void *g33, void *g12,
186  void *g13, void *g23, int *nelv, int *lx) {
187  cudaEvent_t start,stop;
188  float time1,time2;
189  int retval;
190 
191  const dim3 nthrds_1d(1024, 1, 1);
192  const dim3 nblcks_1d((*nelv), 1, 1);
193  const dim3 nthrds_kstep((*lx), (*lx), 1);
194  const dim3 nblcks_kstep((*nelv), 1, 1);
195  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
196 
197  char *env_value = NULL;
198  char neko_log_buf[80];
199 
200  env_value=getenv("NEKO_AUTOTUNE");
201 
202  sprintf(neko_log_buf, "Autotune Ax helm (lx: %d)", *lx);
203  log_section(neko_log_buf);
204 
205  if(env_value) {
206  if( !strcmp(env_value,"1D") ) {
207  CASE_1D(LX);
208  sprintf(neko_log_buf,"Set by env : 1 (1D)");
209  log_message(neko_log_buf);
210  log_end_section();
211  return 1;
212  } else if( !strcmp(env_value,"KSTEP") ) {
213  CASE_KSTEP(LX);
214  sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");
215  log_message(neko_log_buf);
216  log_end_section();
217  return 2;
218  } else {
219  sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");
220  log_error(neko_log_buf);
221  }
222  }
223 
224  cudaEventCreate(&start);
225  cudaEventCreate(&stop);
226 
227  cudaEventRecord(start,0);
228 
229  for(int i = 0; i < 100; i++) {
230  CASE_1D(LX);
231  }
232 
233  cudaEventRecord(stop,0);
234  cudaEventSynchronize(stop);
235  cudaEventElapsedTime(&time1, start, stop);
236 
237  cudaEventRecord(start,0);
238 
239  for(int i = 0; i < 100; i++) {
240  CASE_KSTEP(LX);
241  }
242 
243  cudaEventRecord(stop,0);
244  cudaEventSynchronize(stop);
245  cudaEventElapsedTime(&time2, start, stop);
246 
247  if(time1 < time2) {
248  retval = 1;
249  } else {
250  retval = 2;
251  }
252 
253  sprintf(neko_log_buf, "Chose : %d (%s)", retval,
254  (retval > 1 ? "KSTEP" : "1D"));
255  log_message(neko_log_buf);
256  log_end_section();
257  return retval;
258 }
259 
260 template < const int LX >
261 int tune_padded(void *w, void *u, void *dx, void *dy, void *dz,
262  void *dxt, void *dyt, void *dzt, void *h1,
263  void *g11, void *g22, void *g33, void *g12,
264  void *g13, void *g23, int *nelv, int *lx) {
265  cudaEvent_t start, stop;
266  float time1, time2;
267  int retval;
268 
269  const dim3 nthrds_1d(1024, 1, 1);
270  const dim3 nblcks_1d((*nelv), 1, 1);
271  const dim3 nthrds_kstep((*lx), (*lx), 1);
272  const dim3 nblcks_kstep((*nelv), 1, 1);
273  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
274 
275  char *env_value = NULL;
276  char neko_log_buf[80];
277 
278  env_value=getenv("NEKO_AUTOTUNE");
279 
280  sprintf(neko_log_buf, "Autotune Ax helm (lx: %d)", *lx);
281  log_section(neko_log_buf);
282 
283  if(env_value) {
284  if( !strcmp(env_value,"1D") ) {
285  CASE_1D(LX);
286  sprintf(neko_log_buf,"Set by env : 1 (1D)");
287  log_message(neko_log_buf);
288  log_end_section();
289  return 1;
290  } else if( !strcmp(env_value,"KSTEP") ) {
291  CASE_KSTEP(LX);
292  sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");
293  log_message(neko_log_buf);
294  log_end_section();
295  return 2;
296  } else {
297  sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");
298  log_error(neko_log_buf);
299  }
300  }
301 
302  cudaEventCreate(&start);
303  cudaEventCreate(&stop);
304 
305  cudaEventRecord(start,0);
306 
307  for(int i = 0; i < 100; i++) {
308  CASE_1D(LX);
309  }
310 
311  cudaEventRecord(stop, 0);
312  cudaEventSynchronize(stop);
313  cudaEventElapsedTime(&time1, start, stop);
314 
315  cudaEventRecord(start, 0);
316 
317  for(int i = 0; i < 100; i++) {
318  CASE_KSTEP_PADDED(LX);
319  }
320 
321  cudaEventRecord(stop, 0);
322  cudaEventSynchronize(stop);
323  cudaEventElapsedTime(&time2, start, stop);
324 
325  if(time1 < time2) {
326  retval=1;
327  } else {
328  retval=2;
329  }
330 
331  sprintf(neko_log_buf, "Chose : %d (%s)", retval,
332  (retval > 1 ? "KSTEP" : "1D"));
333  log_message(neko_log_buf);
334  log_end_section();
335  return retval;
336 }
#define CASE(LX)
#define CASE_KSTEP(LX)
int tune(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.cu:183
void cuda_ax_helm(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.cu:63
#define CASE_PADDED(LX)
#define CASE_LARGE(LX)
#define CASE_1D(LX)
#define CASE_KSTEP_PADDED(LX)
#define CASE_LARGE_PADDED(LX)
int tune_padded(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.cu:261
const int i
Definition: cdtp_kernel.h:132
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
Definition: cdtp_kernel.h:115
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
Definition: cdtp_kernel.h:114
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
Definition: cdtp_kernel.h:113
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
Definition: conv1_kernel.h:138
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
Definition: conv1_kernel.h:136
__global__ void const T *__restrict__ u
Definition: conv1_kernel.h:132
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
Definition: conv1_kernel.h:137
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
void * glb_cmd_queue
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11
void log_error(char *msg)
void log_message(char *msg)
void log_end_section()
void log_section(char *msg)