Neko  0.8.99
A portable framework for high-order spectral element flow simulations
ax_helm.cu
Go to the documentation of this file.
1 /*
2  Copyright (c) 2021-2024, The Neko Authors
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above
13  copyright notice, this list of conditions and the following
14  disclaimer in the documentation and/or other materials provided
15  with the distribution.
16 
17  * Neither the name of the authors nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  POSSIBILITY OF SUCH DAMAGE.
33 */
34 
35 #include <string.h>
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include "ax_helm_kernel.h"
39 #include <device/device_config.h>
40 #include <device/cuda/check.h>
41 
42 extern "C" {
43  #include <common/neko_log.h>
44 }
45 
46 template < const int>
47 int tune(void *w, void *u, void *dx, void *dy, void *dz,
48  void *dxt, void *dyt, void *dzt, void *h1,
49  void *g11, void *g22, void *g33, void *g12,
50  void *g13, void *g23, int *nelv, int *lx);
51 
52 template < const int>
53 int tune_padded(void *w, void *u, void *dx, void *dy, void *dz,
54  void *dxt, void *dyt, void *dzt, void *h1,
55  void *g11, void *g22, void *g33, void *g12,
56  void *g13, void *g23, int *nelv, int *lx);
57 
58 extern "C" {
59 
63  void cuda_ax_helm(void *w, void *u, void *dx, void *dy, void *dz,
64  void *dxt, void *dyt, void *dzt, void *h1,
65  void *g11, void *g22, void *g33, void *g12,
66  void *g13, void *g23, int *nelv, int *lx) {
67 
68  static int autotune[17] = { 0 };
69 
70  const dim3 nthrds_1d(1024, 1, 1);
71  const dim3 nblcks_1d((*nelv), 1, 1);
72  const dim3 nthrds_kstep((*lx), (*lx), 1);
73  const dim3 nblcks_kstep((*nelv), 1, 1);
74 
75  const dim3 nthrds((*lx), (*lx), 1);
76  const dim3 nblcks((*nelv), 1, 1);
77  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
78 
79 #define CASE_1D(LX) \
80  ax_helm_kernel_1d<real, LX, 1024> \
81  <<<nblcks_1d, nthrds_1d, 0, stream>>>((real *) w, (real *) u, \
82  (real *) dx, (real *) dy, (real *) dz, \
83  (real *) dxt, (real *) dyt, (real *) dzt, (real *) h1,\
84  (real *) g11, (real *) g22, (real *) g33, \
85  (real *) g12, (real *) g13, (real *) g23); \
86  CUDA_CHECK(cudaGetLastError());
87 
88 #define CASE_KSTEP(LX) \
89  ax_helm_kernel_kstep<real, LX> \
90  <<<nblcks_kstep, nthrds_kstep, 0, stream>>>((real *) w, (real *) u, \
91  (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
92  (real *) g11, (real *) g22, (real *) g33, \
93  (real *) g12, (real *) g13, (real *) g23); \
94  CUDA_CHECK(cudaGetLastError());
95 
96 #define CASE_KSTEP_PADDED(LX) \
97  ax_helm_kernel_kstep_padded<real, LX> \
98  <<<nblcks_kstep, nthrds_kstep, 0, stream>>>((real *) w, (real *) u, \
99  (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
100  (real *) g11, (real *) g22, (real *) g33, \
101  (real *) g12, (real *) g13, (real *) g23); \
102  CUDA_CHECK(cudaGetLastError());
103 
104 #define CASE(LX) \
105  case LX: \
106  if(autotune[LX] == 0 ) { \
107  autotune[LX]=tune<LX>( w, u, \
108  dx, dy, dz, \
109  dxt, dyt, dzt,h1, \
110  g11, g22, g33, \
111  g12, g13, g23, nelv, lx); \
112  } else if (autotune[LX] == 1 ) { \
113  CASE_1D(LX); \
114  } else if (autotune[LX] == 2 ) { \
115  CASE_KSTEP(LX); \
116  } \
117  break
118 
119 
120 #define CASE_PADDED(LX) \
121  case LX: \
122  if(autotune[LX] == 0 ) { \
123  autotune[LX]=tune_padded<LX>(w, u, \
124  dx, dy, dz, \
125  dxt, dyt, dzt,h1, \
126  g11, g22, g33, \
127  g12, g13, g23,nelv,lx); \
128  } else if (autotune[LX] == 1 ) { \
129  CASE_1D(LX); \
130  } else if (autotune[LX] == 2 ) { \
131  CASE_KSTEP_PADDED(LX); \
132  } \
133  break
134 
135 #define CASE_LARGE(LX) \
136  case LX: \
137  CASE_KSTEP(LX); \
138  break
139 
140 #define CASE_LARGE_PADDED(LX) \
141  case LX: \
142  CASE_KSTEP_PADDED(LX); \
143  break
144 
145 
146  if ((*lx) < 12) {
147  switch(*lx) {
148  CASE(2);
149  CASE(3);
150  CASE_PADDED(4);
151  CASE(5);
152  CASE(6);
153  CASE(7);
154  CASE_PADDED(8);
155  CASE(9);
156  CASE(10);
157  CASE(11);
158  default:
159  {
160  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
161  exit(1);
162  }
163  }
164  }
165  else {
166  switch(*lx) {
167  CASE_LARGE(12);
168  CASE_LARGE(13);
169  CASE_LARGE(14);
170  CASE_LARGE(15);
171  CASE_LARGE_PADDED(16);
172  default:
173  {
174  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
175  exit(1);
176  }
177  }
178  }
179  }
180 
184  void cuda_ax_helm_vector(void *au, void *av, void *aw,
185  void *u, void *v, void *w,
186  void *dx, void *dy, void *dz,
187  void *dxt, void *dyt, void *dzt,
188  void *h1, void *g11, void *g22,
189  void *g33, void *g12, void *g13,
190  void *g23, int *nelv, int *lx) {
191 
192  const dim3 nthrds((*lx), (*lx), 1);
193  const dim3 nblcks((*nelv), 1, 1);
194  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
195 
196 #define CASE_VECTOR_KSTEP(LX) \
197  ax_helm_kernel_vector_kstep<real, LX> \
198  <<<nblcks, nthrds, 0, stream>>> ((real *) au, (real *) av, (real *) aw, \
199  (real *) u, (real *) v, (real *) w, \
200  (real *) dx, (real *) dy, (real *) dz, \
201  (real *) h1, (real *) g11, (real *) g22, \
202  (real *) g33, (real *) g12, (real *) g13, \
203  (real *) g23); \
204  CUDA_CHECK(cudaGetLastError());
205 
206 #define CASE_VECTOR_KSTEP_PADDED(LX) \
207  ax_helm_kernel_vector_kstep_padded<real, LX> \
208  <<<nblcks, nthrds, 0, stream>>> ((real *) au, (real *) av, (real *) aw, \
209  (real *) u, (real *) v, (real *) w, \
210  (real *) dx, (real *) dy, (real *) dz, \
211  (real *) h1, (real *) g11, (real *) g22, \
212  (real *) g33, (real *) g12, (real *) g13, \
213  (real *) g23); \
214  CUDA_CHECK(cudaGetLastError());
215 
216 
217 #define CASE_VECTOR(LX) \
218  case LX: \
219  CASE_VECTOR_KSTEP(LX); \
220  break
221 
222 #define CASE_VECTOR_PADDED(LX) \
223  case LX: \
224  CASE_VECTOR_KSTEP_PADDED(LX); \
225  break
226 
227  switch(*lx) {
228  CASE_VECTOR(2);
229  CASE_VECTOR(3);
231  CASE_VECTOR(5);
232  CASE_VECTOR(6);
233  CASE_VECTOR(7);
235  CASE_VECTOR(9);
236  CASE_VECTOR(10);
237  CASE_VECTOR(11);
238  CASE_VECTOR(12);
239  CASE_VECTOR(13);
240  CASE_VECTOR(14);
241  CASE_VECTOR(15);
242  CASE_VECTOR_PADDED(16);
243  default:
244  {
245  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
246  exit(1);
247  }
248  }
249  }
250 
254  void cuda_ax_helm_vector_part2(void *au, void *av, void *aw,
255  void *u, void *v, void *w,
256  void *h2, void *B, int *n) {
257 
258  const dim3 nthrds(1024, 1, 1);
259  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
260  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
261 
262  ax_helm_kernel_vector_part2<real>
263  <<<nblcks, nthrds, 0, stream>>> ((real *) au, (real *) av, (real *) aw,
264  (real *) u, (real *) v, (real *) w,
265  (real *) h2, (real *) B, *n);
266  }
267 }
268 
269 template < const int LX >
270 int tune(void *w, void *u, void *dx, void *dy, void *dz,
271  void *dxt, void *dyt, void *dzt, void *h1,
272  void *g11, void *g22, void *g33, void *g12,
273  void *g13, void *g23, int *nelv, int *lx) {
274  cudaEvent_t start,stop;
275  float time1,time2;
276  int retval;
277 
278  const dim3 nthrds_1d(1024, 1, 1);
279  const dim3 nblcks_1d((*nelv), 1, 1);
280  const dim3 nthrds_kstep((*lx), (*lx), 1);
281  const dim3 nblcks_kstep((*nelv), 1, 1);
282  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
283 
284  char *env_value = NULL;
285  char neko_log_buf[80];
286 
287  env_value=getenv("NEKO_AUTOTUNE");
288 
289  sprintf(neko_log_buf, "Autotune Ax helm (lx: %d)", *lx);
290  log_section(neko_log_buf);
291 
292  if(env_value) {
293  if( !strcmp(env_value,"1D") ) {
294  CASE_1D(LX);
295  sprintf(neko_log_buf,"Set by env : 1 (1D)");
296  log_message(neko_log_buf);
297  log_end_section();
298  return 1;
299  } else if( !strcmp(env_value,"KSTEP") ) {
300  CASE_KSTEP(LX);
301  sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");
302  log_message(neko_log_buf);
303  log_end_section();
304  return 2;
305  } else {
306  sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");
307  log_error(neko_log_buf);
308  }
309  }
310 
311  cudaEventCreate(&start);
312  cudaEventCreate(&stop);
313 
314  cudaEventRecord(start,0);
315 
316  for(int i = 0; i < 100; i++) {
317  CASE_1D(LX);
318  }
319 
320  cudaEventRecord(stop,0);
321  cudaEventSynchronize(stop);
322  cudaEventElapsedTime(&time1, start, stop);
323 
324  cudaEventRecord(start,0);
325 
326  for(int i = 0; i < 100; i++) {
327  CASE_KSTEP(LX);
328  }
329 
330  cudaEventRecord(stop,0);
331  cudaEventSynchronize(stop);
332  cudaEventElapsedTime(&time2, start, stop);
333 
334  if(time1 < time2) {
335  retval = 1;
336  } else {
337  retval = 2;
338  }
339 
340  sprintf(neko_log_buf, "Chose : %d (%s)", retval,
341  (retval > 1 ? "KSTEP" : "1D"));
342  log_message(neko_log_buf);
343  log_end_section();
344  return retval;
345 }
346 
347 template < const int LX >
348 int tune_padded(void *w, void *u, void *dx, void *dy, void *dz,
349  void *dxt, void *dyt, void *dzt, void *h1,
350  void *g11, void *g22, void *g33, void *g12,
351  void *g13, void *g23, int *nelv, int *lx) {
352  cudaEvent_t start, stop;
353  float time1, time2;
354  int retval;
355 
356  const dim3 nthrds_1d(1024, 1, 1);
357  const dim3 nblcks_1d((*nelv), 1, 1);
358  const dim3 nthrds_kstep((*lx), (*lx), 1);
359  const dim3 nblcks_kstep((*nelv), 1, 1);
360  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
361 
362  char *env_value = NULL;
363  char neko_log_buf[80];
364 
365  env_value=getenv("NEKO_AUTOTUNE");
366 
367  sprintf(neko_log_buf, "Autotune Ax helm (lx: %d)", *lx);
368  log_section(neko_log_buf);
369 
370  if(env_value) {
371  if( !strcmp(env_value,"1D") ) {
372  CASE_1D(LX);
373  sprintf(neko_log_buf,"Set by env : 1 (1D)");
374  log_message(neko_log_buf);
375  log_end_section();
376  return 1;
377  } else if( !strcmp(env_value,"KSTEP") ) {
378  CASE_KSTEP(LX);
379  sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");
380  log_message(neko_log_buf);
381  log_end_section();
382  return 2;
383  } else {
384  sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");
385  log_error(neko_log_buf);
386  }
387  }
388 
389  cudaEventCreate(&start);
390  cudaEventCreate(&stop);
391 
392  cudaEventRecord(start,0);
393 
394  for(int i = 0; i < 100; i++) {
395  CASE_1D(LX);
396  }
397 
398  cudaEventRecord(stop, 0);
399  cudaEventSynchronize(stop);
400  cudaEventElapsedTime(&time1, start, stop);
401 
402  cudaEventRecord(start, 0);
403 
404  for(int i = 0; i < 100; i++) {
405  CASE_KSTEP_PADDED(LX);
406  }
407 
408  cudaEventRecord(stop, 0);
409  cudaEventSynchronize(stop);
410  cudaEventElapsedTime(&time2, start, stop);
411 
412  if(time1 < time2) {
413  retval=1;
414  } else {
415  retval=2;
416  }
417 
418  sprintf(neko_log_buf, "Chose : %d (%s)", retval,
419  (retval > 1 ? "KSTEP" : "1D"));
420  log_message(neko_log_buf);
421  log_end_section();
422  return retval;
423 }
#define CASE_VECTOR(LX)
void cuda_ax_helm_vector_part2(void *au, void *av, void *aw, void *u, void *v, void *w, void *h2, void *B, int *n)
Definition: ax_helm.cu:254
#define CASE(LX)
#define CASE_KSTEP(LX)
int tune(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.cu:270
void cuda_ax_helm(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.cu:63
#define CASE_PADDED(LX)
void cuda_ax_helm_vector(void *au, void *av, void *aw, void *u, void *v, void *w, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.cu:184
#define CASE_LARGE(LX)
#define CASE_1D(LX)
#define CASE_KSTEP_PADDED(LX)
#define CASE_LARGE_PADDED(LX)
int tune_padded(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.cu:348
#define CASE_VECTOR_PADDED(LX)
const int i
Definition: cdtp_kernel.h:128
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
Definition: cdtp_kernel.h:112
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
Definition: cdtp_kernel.h:111
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
Definition: cdtp_kernel.h:110
__global__ void const T *__restrict__ u
Definition: conv1_kernel.h:132
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
Definition: conv1_kernel.h:136
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
Definition: conv1_kernel.h:138
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
Definition: conv1_kernel.h:137
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void const T *__restrict__ const T *__restrict__ v
double real
Definition: device_config.h:12
void * glb_cmd_queue
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
__global__ void T *__restrict__ T *__restrict__ aw
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
__global__ void T *__restrict__ av
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
void log_error(char *msg)
void log_message(char *msg)
void log_end_section()
void log_section(char *msg)