Neko  0.9.0
A portable framework for high-order spectral element flow simulations
ax_helm.hip
Go to the documentation of this file.
1 /*
2  Copyright (c) 2021-2022, The Neko Authors
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above
13  copyright notice, this list of conditions and the following
14  disclaimer in the documentation and/or other materials provided
15  with the distribution.
16 
17  * Neither the name of the authors nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  POSSIBILITY OF SUCH DAMAGE.
33 */
34 
35 #include <string.h>
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <hip/hip_runtime.h>
39 #include <device/device_config.h>
40 #include <device/hip/check.h>
41 #include "ax_helm_kernel.h"
42 
43 extern "C" {
44  #include <common/neko_log.h>
45 }
46 
47 template < const int>
48 int tune(void *w, void *u, void *dx, void *dy, void *dz,
49  void *dxt, void *dyt, void *dzt, void *h1,
50  void *g11, void *g22, void *g33, void *g12,
51  void *g13, void *g23, int *nelv, int *lx);
52 
53 template < const int>
54 int tune_padded(void *w, void *u, void *dx, void *dy, void *dz,
55  void *dxt, void *dyt, void *dzt, void *h1,
56  void *g11, void *g22, void *g33, void *g12,
57  void *g13, void *g23, int *nelv, int *lx);
58 
59 extern "C" {
60 
64  void hip_ax_helm(void *w, void *u, void *dx, void *dy, void *dz,
65  void *dxt, void *dyt, void *dzt, void *h1,
66  void *g11, void *g22, void *g33, void *g12,
67  void *g13, void *g23, int *nelv, int *lx) {
68 
69  static int autotune[13] = { 0 };
70 
71  const dim3 nthrds_1d(1024, 1, 1);
72  const dim3 nblcks_1d((*nelv), 1, 1);
73  const dim3 nthrds_kstep((*lx), (*lx), 1);
74  const dim3 nblcks_kstep((*nelv), 1, 1);
75 
76 #define CASE_1D(LX) \
77  hipLaunchKernelGGL(HIP_KERNEL_NAME( ax_helm_kernel_1d<real, LX, 1024> ), \
78  nblcks_1d, nthrds_1d, 0, \
79  (hipStream_t) glb_cmd_queue, \
80  (real *) w, (real *) u, \
81  (real *) dx, (real *) dy, (real *) dz, \
82  (real *) dxt, (real *) dyt, (real *) dzt, (real *) h1, \
83  (real *) g11, (real *) g22, (real *) g33, \
84  (real *) g12, (real *) g13, (real *) g23); \
85  HIP_CHECK(hipGetLastError());
86 
87 #define CASE_KSTEP(LX) \
88  hipLaunchKernelGGL( HIP_KERNEL_NAME( ax_helm_kernel_kstep<real, LX> ), \
89  nblcks_kstep, nthrds_kstep, 0, \
90  (hipStream_t) glb_cmd_queue, \
91  (real *) w, (real *) u, \
92  (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
93  (real *) g11, (real *) g22, (real *) g33, \
94  (real *) g12, (real *) g13, (real *) g23); \
95  HIP_CHECK(hipGetLastError());
96 
97 
98 
99 #define CASE_KSTEP_PADDED(LX) \
100  hipLaunchKernelGGL( HIP_KERNEL_NAME(ax_helm_kernel_kstep_padded<real, LX> ),\
101  nblcks_kstep, nthrds_kstep, 0, \
102  (hipStream_t) glb_cmd_queue, \
103  (real *) w, (real *) u, \
104  (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
105  (real *) g11, (real *) g22, (real *) g33, \
106  (real *) g12, (real *) g13, (real *) g23); \
107  HIP_CHECK(hipGetLastError());
108 
109 #define CASE(LX) \
110  case LX: \
111  if(autotune[LX] == 0 ) { \
112  autotune[LX]=tune<LX>( w, u, \
113  dx, dy, dz, \
114  dxt, dyt, dzt,h1, \
115  g11, g22, g33, \
116  g12, g13, g23, nelv, lx); \
117  } else if (autotune[LX] == 1 ) { \
118  CASE_1D(LX); \
119  } else if (autotune[LX] == 2 ) { \
120  CASE_KSTEP(LX); \
121  } \
122  break
123 
124 
125 #define CASE_PADDED(LX) \
126  case LX: \
127  if(autotune[LX] == 0 ) { \
128  autotune[LX]=tune_padded<LX>(w, u, \
129  dx, dy, dz, \
130  dxt, dyt, dzt,h1, \
131  g11, g22, g33, \
132  g12, g13, g23,nelv,lx); \
133  } else if (autotune[LX] == 1 ) { \
134  CASE_1D(LX); \
135  } else if (autotune[LX] == 2 ) { \
136  CASE_KSTEP_PADDED(LX); \
137  } \
138  break
139 
140 
141  switch(*lx) {
142  CASE(2);
143  CASE(3);
144  CASE(4);
145  CASE(5);
146  CASE(6);
147  CASE(7);
148  CASE_PADDED(8);
149  CASE(9);
150  CASE(10);
151  CASE(11);
152  CASE(12);
153  default:
154  {
155  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
156  exit(1);
157  }
158  }
159  }
163  void hip_ax_helm_vector(void *au, void *av, void *aw,
164  void *u, void *v, void *w,
165  void *dx, void *dy, void *dz,
166  void *dxt, void *dyt, void *dzt,
167  void *h1, void *g11, void *g22,
168  void *g33, void *g12, void *g13,
169  void *g23, int *nelv, int *lx) {
170 
171  const dim3 nthrds((*lx), (*lx), 1);
172  const dim3 nblcks((*nelv), 1, 1);
173 
174 #define CASE_VECTOR_KSTEP(LX) \
175  hipLaunchKernelGGL( HIP_KERNEL_NAME( ax_helm_kernel_vector_kstep<real, LX> ), \
176  nblcks, nthrds, 0, \
177  (hipStream_t) glb_cmd_queue, \
178  (real *) au, (real *) av, (real *) aw, \
179  (real *) u, (real *) v, (real *) w, \
180  (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
181  (real *) g11, (real *) g22, (real *) g33, \
182  (real *) g12, (real *) g13, (real *) g23); \
183  HIP_CHECK(hipGetLastError());
184 
185 #define CASE_VECTOR_KSTEP_PADDED(LX) \
186  hipLaunchKernelGGL( HIP_KERNEL_NAME( ax_helm_kernel_vector_kstep_padded<real, LX> ), \
187  nblcks, nthrds, 0, \
188  (hipStream_t) glb_cmd_queue, \
189  (real *) au, (real *) av, (real *) aw, \
190  (real *) u, (real *) v, (real *) w, \
191  (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
192  (real *) g11, (real *) g22, (real *) g33, \
193  (real *) g12, (real *) g13, (real *) g23); \
194  HIP_CHECK(hipGetLastError());
195 
196 #define CASE_VECTOR(LX) \
197  case LX: \
198  CASE_VECTOR_KSTEP(LX); \
199  break
200 
201 #define CASE_VECTOR_PADDED(LX) \
202  case LX: \
203  CASE_VECTOR_KSTEP_PADDED(LX); \
204  break
205 
206  switch(*lx) {
207  CASE_VECTOR(2);
208  CASE_VECTOR(3);
210  CASE_VECTOR(5);
211  CASE_VECTOR(6);
212  CASE_VECTOR(7);
214  CASE_VECTOR(9);
215  CASE_VECTOR(10);
216  CASE_VECTOR(11);
217  CASE_VECTOR(12);
218  CASE_VECTOR(13);
219  CASE_VECTOR(14);
220  CASE_VECTOR(15);
221  CASE_VECTOR_PADDED(16);
222  default:
223  {
224  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
225  exit(1);
226  }
227  }
228  }
229 
233  void hip_ax_helm_vector_part2(void *au, void *av, void *aw,
234  void *u, void *v, void *w,
235  void *h2, void *B, int *n) {
236 
237  const dim3 nthrds(1024, 1, 1);
238  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
239  const hipStream_t stream = (hipStream_t) glb_cmd_queue;
240 
241  hipLaunchKernelGGL( HIP_KERNEL_NAME( ax_helm_kernel_vector_part2<real> ),
242  nblcks, nthrds, 0, stream,
243  (real *) au, (real *) av, (real *) aw,
244  (real *) u, (real *) v, (real *) w,
245  (real *) h2, (real *) B, *n);
246  }
247 
248 }
249 
250 template < const int LX >
251 int tune(void *w, void *u, void *dx, void *dy, void *dz,
252  void *dxt, void *dyt, void *dzt, void *h1,
253  void *g11, void *g22, void *g33, void *g12,
254  void *g13, void *g23, int *nelv, int *lx) {
255  hipEvent_t start,stop;
256  float time1,time2;
257  int retval;
258 
259  const dim3 nthrds_1d(1024, 1, 1);
260  const dim3 nblcks_1d((*nelv), 1, 1);
261  const dim3 nthrds_kstep((*lx), (*lx), 1);
262  const dim3 nblcks_kstep((*nelv), 1, 1);
263 
264  char *env_value = NULL;
265  char neko_log_buf[80];
266 
267  env_value=getenv("NEKO_AUTOTUNE");
268 
269  sprintf(neko_log_buf, "Autotune Ax helm (lx: %d)", *lx);
270  log_section(neko_log_buf);
271 
272  if(env_value) {
273  if( !strcmp(env_value,"1D") ) {
274  CASE_1D(LX);
275  sprintf(neko_log_buf,"Set by env : 1 (1D)");
276  log_message(neko_log_buf);
277  log_end_section();
278  return 1;
279  } else if( !strcmp(env_value,"KSTEP") ) {
280  CASE_KSTEP(LX);
281  sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");
282  log_message(neko_log_buf);
283  log_end_section();
284  return 2;
285  } else {
286  sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");
287  log_error(neko_log_buf);
288  }
289  }
290 
291  HIP_CHECK(hipEventCreate(&start));
292  HIP_CHECK(hipEventCreate(&stop));
293 
294  HIP_CHECK(hipEventRecord(start,0));
295 
296  for(int i = 0; i < 100; i++) {
297  CASE_1D(LX);
298  }
299 
300  HIP_CHECK(hipEventRecord(stop,0));
301  HIP_CHECK(hipEventSynchronize(stop));
302  HIP_CHECK(hipEventElapsedTime(&time1, start, stop));
303 
304  HIP_CHECK(hipEventRecord(start,0));
305 
306  for(int i = 0; i < 100; i++) {
307  CASE_KSTEP(LX);
308  }
309 
310  HIP_CHECK(hipEventRecord(stop,0));
311  HIP_CHECK(hipEventSynchronize(stop));
312  HIP_CHECK(hipEventElapsedTime(&time2, start, stop));
313 
314  if(time1 < time2) {
315  retval = 1;
316  } else {
317  retval = 2;
318  }
319 
320  sprintf(neko_log_buf, "Chose : %d (%s)", retval,
321  (retval > 1 ? "KSTEP" : "1D"));
322  log_message(neko_log_buf);
323  log_end_section();
324  return retval;
325 }
326 
327 template < const int LX >
328 int tune_padded(void *w, void *u, void *dx, void *dy, void *dz,
329  void *dxt, void *dyt, void *dzt, void *h1,
330  void *g11, void *g22, void *g33, void *g12,
331  void *g13, void *g23, int *nelv, int *lx) {
332  hipEvent_t start, stop;
333  float time1, time2;
334  int retval;
335 
336  const dim3 nthrds_1d(1024, 1, 1);
337  const dim3 nblcks_1d((*nelv), 1, 1);
338  const dim3 nthrds_kstep((*lx), (*lx), 1);
339  const dim3 nblcks_kstep((*nelv), 1, 1);
340 
341  char *env_value = NULL;
342  char neko_log_buf[80];
343 
344  env_value=getenv("NEKO_AUTOTUNE");
345 
346  sprintf(neko_log_buf, "Autotune Ax helm (lx: %d)", *lx);
347  log_section(neko_log_buf);
348 
349  if(env_value) {
350  if( !strcmp(env_value,"1D") ) {
351  CASE_1D(LX);
352  sprintf(neko_log_buf,"Set by env : 1 (1D)");
353  log_message(neko_log_buf);
354  log_end_section();
355  return 1;
356  } else if( !strcmp(env_value,"KSTEP") ) {
357  CASE_KSTEP(LX);
358  sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");
359  log_message(neko_log_buf);
360  log_end_section();
361  return 2;
362  } else {
363  sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");
364  log_error(neko_log_buf);
365  }
366  }
367 
368  HIP_CHECK(hipEventCreate(&start));
369  HIP_CHECK(hipEventCreate(&stop));
370 
371  HIP_CHECK(hipEventRecord(start,0));
372 
373  for(int i = 0; i < 100; i++) {
374  CASE_1D(LX);
375  }
376 
377  HIP_CHECK(hipEventRecord(stop, 0));
378  HIP_CHECK(hipEventSynchronize(stop));
379  HIP_CHECK(hipEventElapsedTime(&time1, start, stop));
380 
381  HIP_CHECK(hipEventRecord(start, 0));
382 
383  for(int i = 0; i < 100; i++) {
384  CASE_KSTEP_PADDED(LX);
385  }
386 
387  HIP_CHECK(hipEventRecord(stop, 0));
388  HIP_CHECK(hipEventSynchronize(stop));
389  HIP_CHECK(hipEventElapsedTime(&time2, start, stop));
390 
391  if(time1 < time2) {
392  retval=1;
393  } else {
394  retval=2;
395  }
396 
397  sprintf(neko_log_buf, "Chose : %d (%s)", retval,
398  (retval > 1 ? "KSTEP" : "1D"));
399  log_message(neko_log_buf);
400  log_end_section();
401  return retval;
402 }
#define CASE_VECTOR(LX)
#define CASE(LX)
#define CASE_KSTEP(LX)
void hip_ax_helm_vector_part2(void *au, void *av, void *aw, void *u, void *v, void *w, void *h2, void *B, int *n)
Definition: ax_helm.hip:233
int tune(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.hip:251
#define CASE_PADDED(LX)
#define CASE_1D(LX)
#define CASE_KSTEP_PADDED(LX)
void hip_ax_helm_vector(void *au, void *av, void *aw, void *u, void *v, void *w, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.hip:163
void hip_ax_helm(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.hip:64
int tune_padded(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.hip:328
#define CASE_VECTOR_PADDED(LX)
__global__ void T *__restrict__ T *__restrict__ aw
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
const int i
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ av
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
Definition: cdtp_kernel.h:112
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
Definition: cdtp_kernel.h:111
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
Definition: cdtp_kernel.h:110
double real
Definition: device_config.h:12
void * glb_cmd_queue
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11
#define HIP_CHECK(err)
Definition: check.h:8
void log_error(char *msg)
void log_message(char *msg)
void log_end_section()
void log_section(char *msg)