Neko  0.9.0
A portable framework for high-order spectral element flow simulations
ax_helm.cu
Go to the documentation of this file.
1 /*
2  Copyright (c) 2021-2024, The Neko Authors
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above
13  copyright notice, this list of conditions and the following
14  disclaimer in the documentation and/or other materials provided
15  with the distribution.
16 
17  * Neither the name of the authors nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  POSSIBILITY OF SUCH DAMAGE.
33 */
34 
35 #include <string.h>
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include "ax_helm_kernel.h"
39 #include <device/device_config.h>
40 #include <device/cuda/check.h>
41 
42 extern "C" {
43  #include <common/neko_log.h>
44 }
45 
46 template < const int>
47 int tune(void *w, void *u, void *dx, void *dy, void *dz,
48  void *dxt, void *dyt, void *dzt, void *h1,
49  void *g11, void *g22, void *g33, void *g12,
50  void *g13, void *g23, int *nelv, int *lx);
51 
52 template < const int>
53 int tune_padded(void *w, void *u, void *dx, void *dy, void *dz,
54  void *dxt, void *dyt, void *dzt, void *h1,
55  void *g11, void *g22, void *g33, void *g12,
56  void *g13, void *g23, int *nelv, int *lx);
57 
58 extern "C" {
59 
63  void cuda_ax_helm(void *w, void *u, void *dx, void *dy, void *dz,
64  void *dxt, void *dyt, void *dzt, void *h1,
65  void *g11, void *g22, void *g33, void *g12,
66  void *g13, void *g23, int *nelv, int *lx) {
67 
68  static int autotune[17] = { 0 };
69 
70  const dim3 nthrds_1d(1024, 1, 1);
71  const dim3 nblcks_1d((*nelv), 1, 1);
72  const dim3 nthrds_kstep((*lx), (*lx), 1);
73  const dim3 nblcks_kstep((*nelv), 1, 1);
74 
75  const dim3 nthrds((*lx), (*lx), 1);
76  const dim3 nblcks((*nelv), 1, 1);
77  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
78 
79 #define CASE_1D(LX) \
80  ax_helm_kernel_1d<real, LX, 1024> \
81  <<<nblcks_1d, nthrds_1d, 0, stream>>>((real *) w, (real *) u, \
82  (real *) dx, (real *) dy, (real *) dz, \
83  (real *) dxt, (real *) dyt, (real *) dzt, (real *) h1,\
84  (real *) g11, (real *) g22, (real *) g33, \
85  (real *) g12, (real *) g13, (real *) g23); \
86  CUDA_CHECK(cudaGetLastError());
87 
88 #define CASE_KSTEP(LX) \
89  ax_helm_kernel_kstep<real, LX> \
90  <<<nblcks_kstep, nthrds_kstep, 0, stream>>>((real *) w, (real *) u, \
91  (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
92  (real *) g11, (real *) g22, (real *) g33, \
93  (real *) g12, (real *) g13, (real *) g23); \
94  CUDA_CHECK(cudaGetLastError());
95 
96 #define CASE_KSTEP_PADDED(LX) \
97  ax_helm_kernel_kstep_padded<real, LX> \
98  <<<nblcks_kstep, nthrds_kstep, 0, stream>>>((real *) w, (real *) u, \
99  (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
100  (real *) g11, (real *) g22, (real *) g33, \
101  (real *) g12, (real *) g13, (real *) g23); \
102  CUDA_CHECK(cudaGetLastError());
103 
104 #define CASE(LX) \
105  case LX: \
106  if(autotune[LX] == 0 ) { \
107  autotune[LX]=tune<LX>( w, u, \
108  dx, dy, dz, \
109  dxt, dyt, dzt,h1, \
110  g11, g22, g33, \
111  g12, g13, g23, nelv, lx); \
112  } else if (autotune[LX] == 1 ) { \
113  CASE_1D(LX); \
114  } else if (autotune[LX] == 2 ) { \
115  CASE_KSTEP(LX); \
116  } \
117  break
118 
119 
120 #define CASE_PADDED(LX) \
121  case LX: \
122  if(autotune[LX] == 0 ) { \
123  autotune[LX]=tune_padded<LX>(w, u, \
124  dx, dy, dz, \
125  dxt, dyt, dzt,h1, \
126  g11, g22, g33, \
127  g12, g13, g23,nelv,lx); \
128  } else if (autotune[LX] == 1 ) { \
129  CASE_1D(LX); \
130  } else if (autotune[LX] == 2 ) { \
131  CASE_KSTEP_PADDED(LX); \
132  } \
133  break
134 
135 #define CASE_LARGE(LX) \
136  case LX: \
137  CASE_KSTEP(LX); \
138  break
139 
140 #define CASE_LARGE_PADDED(LX) \
141  case LX: \
142  CASE_KSTEP_PADDED(LX); \
143  break
144 
145 
146 
147  if ((*lx) < 12) {
148  switch(*lx) {
149  CASE(2);
150  CASE(3);
151  CASE_PADDED(4);
152  CASE(5);
153  CASE(6);
154  CASE(7);
155  CASE_PADDED(8);
156  CASE(9);
157  CASE(10);
158  CASE(11);
159  default:
160  {
161  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
162  exit(1);
163  }
164  }
165  }
166  else {
167  switch(*lx) {
168  CASE_LARGE(12);
169  CASE_LARGE(13);
170  CASE_LARGE(14);
171  CASE_LARGE(15);
172  CASE_LARGE_PADDED(16);
173  default:
174  {
175  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
176  exit(1);
177  }
178  }
179  }
180  }
181 
185  void cuda_ax_helm_vector(void *au, void *av, void *aw,
186  void *u, void *v, void *w,
187  void *dx, void *dy, void *dz,
188  void *dxt, void *dyt, void *dzt,
189  void *h1, void *g11, void *g22,
190  void *g33, void *g12, void *g13,
191  void *g23, int *nelv, int *lx) {
192 
193  const dim3 nthrds((*lx), (*lx), 1);
194  const dim3 nblcks((*nelv), 1, 1);
195  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
196 
197 #define CASE_VECTOR_KSTEP(LX) \
198  ax_helm_kernel_vector_kstep<real, LX> \
199  <<<nblcks, nthrds, 0, stream>>> ((real *) au, (real *) av, (real *) aw, \
200  (real *) u, (real *) v, (real *) w, \
201  (real *) dx, (real *) dy, (real *) dz, \
202  (real *) h1, (real *) g11, (real *) g22, \
203  (real *) g33, (real *) g12, (real *) g13, \
204  (real *) g23); \
205  CUDA_CHECK(cudaGetLastError());
206 
207 #define CASE_VECTOR_KSTEP_PADDED(LX) \
208  ax_helm_kernel_vector_kstep_padded<real, LX> \
209  <<<nblcks, nthrds, 0, stream>>> ((real *) au, (real *) av, (real *) aw, \
210  (real *) u, (real *) v, (real *) w, \
211  (real *) dx, (real *) dy, (real *) dz, \
212  (real *) h1, (real *) g11, (real *) g22, \
213  (real *) g33, (real *) g12, (real *) g13, \
214  (real *) g23); \
215  CUDA_CHECK(cudaGetLastError());
216 
217 
218 #define CASE_VECTOR(LX) \
219  case LX: \
220  CASE_VECTOR_KSTEP(LX); \
221  break
222 
223 #define CASE_VECTOR_PADDED(LX) \
224  case LX: \
225  CASE_VECTOR_KSTEP_PADDED(LX); \
226  break
227 
228  switch(*lx) {
229  CASE_VECTOR(2);
230  CASE_VECTOR(3);
232  CASE_VECTOR(5);
233  CASE_VECTOR(6);
234  CASE_VECTOR(7);
236  CASE_VECTOR(9);
237  CASE_VECTOR(10);
238  CASE_VECTOR(11);
239  CASE_VECTOR(12);
240  CASE_VECTOR(13);
241  CASE_VECTOR(14);
242  CASE_VECTOR(15);
243  CASE_VECTOR_PADDED(16);
244  default:
245  {
246  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
247  exit(1);
248  }
249  }
250  }
251 
255  void cuda_ax_helm_vector_part2(void *au, void *av, void *aw,
256  void *u, void *v, void *w,
257  void *h2, void *B, int *n) {
258 
259  const dim3 nthrds(1024, 1, 1);
260  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
261  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
262 
263  ax_helm_kernel_vector_part2<real>
264  <<<nblcks, nthrds, 0, stream>>> ((real *) au, (real *) av, (real *) aw,
265  (real *) u, (real *) v, (real *) w,
266  (real *) h2, (real *) B, *n);
267  }
268 }
269 
270 template < const int LX >
271 int tune(void *w, void *u, void *dx, void *dy, void *dz,
272  void *dxt, void *dyt, void *dzt, void *h1,
273  void *g11, void *g22, void *g33, void *g12,
274  void *g13, void *g23, int *nelv, int *lx) {
275  cudaEvent_t start,stop;
276  float time1,time2;
277  int retval;
278 
279  const dim3 nthrds_1d(1024, 1, 1);
280  const dim3 nblcks_1d((*nelv), 1, 1);
281  const dim3 nthrds_kstep((*lx), (*lx), 1);
282  const dim3 nblcks_kstep((*nelv), 1, 1);
283  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
284 
285  char *env_value = NULL;
286  char neko_log_buf[80];
287 
288  env_value=getenv("NEKO_AUTOTUNE");
289 
290  sprintf(neko_log_buf, "Autotune Ax helm (lx: %d)", *lx);
291  log_section(neko_log_buf);
292 
293  if(env_value) {
294  if( !strcmp(env_value,"1D") ) {
295  CASE_1D(LX);
296  sprintf(neko_log_buf,"Set by env : 1 (1D)");
297  log_message(neko_log_buf);
298  log_end_section();
299  return 1;
300  } else if( !strcmp(env_value,"KSTEP") ) {
301  CASE_KSTEP(LX);
302  sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");
303  log_message(neko_log_buf);
304  log_end_section();
305  return 2;
306  } else {
307  sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");
308  log_error(neko_log_buf);
309  }
310  }
311 
312  cudaEventCreate(&start);
313  cudaEventCreate(&stop);
314 
315  cudaEventRecord(start,0);
316 
317  for(int i = 0; i < 100; i++) {
318  CASE_1D(LX);
319  }
320 
321  cudaEventRecord(stop,0);
322  cudaEventSynchronize(stop);
323  cudaEventElapsedTime(&time1, start, stop);
324 
325  cudaEventRecord(start,0);
326 
327  for(int i = 0; i < 100; i++) {
328  CASE_KSTEP(LX);
329  }
330 
331  cudaEventRecord(stop,0);
332  cudaEventSynchronize(stop);
333  cudaEventElapsedTime(&time2, start, stop);
334 
335  if(time1 < time2) {
336  retval = 1;
337  } else {
338  retval = 2;
339  }
340 
341  sprintf(neko_log_buf, "Chose : %d (%s)", retval,
342  (retval > 1 ? "KSTEP" : "1D"));
343  log_message(neko_log_buf);
344  log_end_section();
345  return retval;
346 }
347 
348 template < const int LX >
349 int tune_padded(void *w, void *u, void *dx, void *dy, void *dz,
350  void *dxt, void *dyt, void *dzt, void *h1,
351  void *g11, void *g22, void *g33, void *g12,
352  void *g13, void *g23, int *nelv, int *lx) {
353  cudaEvent_t start, stop;
354  float time1, time2;
355  int retval;
356 
357  const dim3 nthrds_1d(1024, 1, 1);
358  const dim3 nblcks_1d((*nelv), 1, 1);
359  const dim3 nthrds_kstep((*lx), (*lx), 1);
360  const dim3 nblcks_kstep((*nelv), 1, 1);
361  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
362 
363  char *env_value = NULL;
364  char neko_log_buf[80];
365 
366  env_value=getenv("NEKO_AUTOTUNE");
367 
368  sprintf(neko_log_buf, "Autotune Ax helm (lx: %d)", *lx);
369  log_section(neko_log_buf);
370 
371  if(env_value) {
372  if( !strcmp(env_value,"1D") ) {
373  CASE_1D(LX);
374  sprintf(neko_log_buf,"Set by env : 1 (1D)");
375  log_message(neko_log_buf);
376  log_end_section();
377  return 1;
378  } else if( !strcmp(env_value,"KSTEP") ) {
379  CASE_KSTEP(LX);
380  sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");
381  log_message(neko_log_buf);
382  log_end_section();
383  return 2;
384  } else {
385  sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");
386  log_error(neko_log_buf);
387  }
388  }
389 
390  cudaEventCreate(&start);
391  cudaEventCreate(&stop);
392 
393  cudaEventRecord(start,0);
394 
395  for(int i = 0; i < 100; i++) {
396  CASE_1D(LX);
397  }
398 
399  cudaEventRecord(stop, 0);
400  cudaEventSynchronize(stop);
401  cudaEventElapsedTime(&time1, start, stop);
402 
403  cudaEventRecord(start, 0);
404 
405  for(int i = 0; i < 100; i++) {
406  CASE_KSTEP_PADDED(LX);
407  }
408 
409  cudaEventRecord(stop, 0);
410  cudaEventSynchronize(stop);
411  cudaEventElapsedTime(&time2, start, stop);
412 
413  if(time1 < time2) {
414  retval=1;
415  } else {
416  retval=2;
417  }
418 
419  sprintf(neko_log_buf, "Chose : %d (%s)", retval,
420  (retval > 1 ? "KSTEP" : "1D"));
421  log_message(neko_log_buf);
422  log_end_section();
423  return retval;
424 }
#define CASE_VECTOR(LX)
void cuda_ax_helm_vector_part2(void *au, void *av, void *aw, void *u, void *v, void *w, void *h2, void *B, int *n)
Definition: ax_helm.cu:255
#define CASE(LX)
#define CASE_KSTEP(LX)
int tune(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.cu:271
void cuda_ax_helm(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.cu:63
#define CASE_PADDED(LX)
void cuda_ax_helm_vector(void *au, void *av, void *aw, void *u, void *v, void *w, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.cu:185
#define CASE_LARGE(LX)
#define CASE_1D(LX)
#define CASE_KSTEP_PADDED(LX)
#define CASE_LARGE_PADDED(LX)
int tune_padded(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.cu:349
#define CASE_VECTOR_PADDED(LX)
__global__ void T *__restrict__ T *__restrict__ aw
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
const int i
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ av
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
Definition: cdtp_kernel.h:112
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
Definition: cdtp_kernel.h:111
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
Definition: cdtp_kernel.h:110
double real
Definition: device_config.h:12
void * glb_cmd_queue
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11
void log_error(char *msg)
void log_message(char *msg)
void log_end_section()
void log_section(char *msg)