Neko  0.8.1
A portable framework for high-order spectral element flow simulations
ax_helm.hip
Go to the documentation of this file.
1 /*
2  Copyright (c) 2021-2022, The Neko Authors
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above
13  copyright notice, this list of conditions and the following
14  disclaimer in the documentation and/or other materials provided
15  with the distribution.
16 
17  * Neither the name of the authors nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  POSSIBILITY OF SUCH DAMAGE.
33 */
34 
35 #include <string.h>
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <hip/hip_runtime.h>
39 #include <device/device_config.h>
40 #include <device/hip/check.h>
41 #include "ax_helm_kernel.h"
42 
43 extern "C" {
44  #include <common/neko_log.h>
45 }
46 
47 template < const int>
48 int tune(void *w, void *u, void *dx, void *dy, void *dz,
49  void *dxt, void *dyt, void *dzt, void *h1,
50  void *g11, void *g22, void *g33, void *g12,
51  void *g13, void *g23, int *nelv, int *lx);
52 
53 template < const int>
54 int tune_padded(void *w, void *u, void *dx, void *dy, void *dz,
55  void *dxt, void *dyt, void *dzt, void *h1,
56  void *g11, void *g22, void *g33, void *g12,
57  void *g13, void *g23, int *nelv, int *lx);
58 
59 extern "C" {
60 
64  void hip_ax_helm(void *w, void *u, void *dx, void *dy, void *dz,
65  void *dxt, void *dyt, void *dzt, void *h1,
66  void *g11, void *g22, void *g33, void *g12,
67  void *g13, void *g23, int *nelv, int *lx) {
68 
69  static int autotune[13] = { 0 };
70 
71  const dim3 nthrds_1d(1024, 1, 1);
72  const dim3 nblcks_1d((*nelv), 1, 1);
73  const dim3 nthrds_kstep((*lx), (*lx), 1);
74  const dim3 nblcks_kstep((*nelv), 1, 1);
75 
76 #define CASE_1D(LX) \
77  hipLaunchKernelGGL(HIP_KERNEL_NAME( ax_helm_kernel_1d<real, LX, 1024> ), \
78  nblcks_1d, nthrds_1d, 0, \
79  (hipStream_t) glb_cmd_queue, \
80  (real *) w, (real *) u, \
81  (real *) dx, (real *) dy, (real *) dz, \
82  (real *) dxt, (real *) dyt, (real *) dzt, (real *) h1, \
83  (real *) g11, (real *) g22, (real *) g33, \
84  (real *) g12, (real *) g13, (real *) g23); \
85  HIP_CHECK(hipGetLastError());
86 
87 #define CASE_KSTEP(LX) \
88  hipLaunchKernelGGL( HIP_KERNEL_NAME( ax_helm_kernel_kstep<real, LX> ), \
89  nblcks_kstep, nthrds_kstep, 0, \
90  (hipStream_t) glb_cmd_queue, \
91  (real *) w, (real *) u, \
92  (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
93  (real *) g11, (real *) g22, (real *) g33, \
94  (real *) g12, (real *) g13, (real *) g23); \
95  HIP_CHECK(hipGetLastError());
96 
97 
98 
99 #define CASE_KSTEP_PADDED(LX) \
100  hipLaunchKernelGGL( HIP_KERNEL_NAME(ax_helm_kernel_kstep_padded<real, LX> ),\
101  nblcks_kstep, nthrds_kstep, 0, \
102  (hipStream_t) glb_cmd_queue, \
103  (real *) w, (real *) u, \
104  (real *) dx, (real *) dy, (real *) dz, (real *) h1, \
105  (real *) g11, (real *) g22, (real *) g33, \
106  (real *) g12, (real *) g13, (real *) g23); \
107  HIP_CHECK(hipGetLastError());
108 
109 #define CASE(LX) \
110  case LX: \
111  if(autotune[LX] == 0 ) { \
112  autotune[LX]=tune<LX>( w, u, \
113  dx, dy, dz, \
114  dxt, dyt, dzt,h1, \
115  g11, g22, g33, \
116  g12, g13, g23, nelv, lx); \
117  } else if (autotune[LX] == 1 ) { \
118  CASE_1D(LX); \
119  } else if (autotune[LX] == 2 ) { \
120  CASE_KSTEP(LX); \
121  } \
122  break
123 
124 
125 #define CASE_PADDED(LX) \
126  case LX: \
127  if(autotune[LX] == 0 ) { \
128  autotune[LX]=tune_padded<LX>(w, u, \
129  dx, dy, dz, \
130  dxt, dyt, dzt,h1, \
131  g11, g22, g33, \
132  g12, g13, g23,nelv,lx); \
133  } else if (autotune[LX] == 1 ) { \
134  CASE_1D(LX); \
135  } else if (autotune[LX] == 2 ) { \
136  CASE_KSTEP_PADDED(LX); \
137  } \
138  break
139 
140 
141  switch(*lx) {
142  CASE(2);
143  CASE(3);
144  CASE(4);
145  CASE(5);
146  CASE(6);
147  CASE(7);
148  CASE_PADDED(8);
149  CASE(9);
150  CASE(10);
151  CASE(11);
152  CASE(12);
153  default:
154  {
155  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
156  exit(1);
157  }
158  }
159  }
160 
161 }
162 
163 template < const int LX >
164 int tune(void *w, void *u, void *dx, void *dy, void *dz,
165  void *dxt, void *dyt, void *dzt, void *h1,
166  void *g11, void *g22, void *g33, void *g12,
167  void *g13, void *g23, int *nelv, int *lx) {
168  hipEvent_t start,stop;
169  float time1,time2;
170  int retval;
171 
172  const dim3 nthrds_1d(1024, 1, 1);
173  const dim3 nblcks_1d((*nelv), 1, 1);
174  const dim3 nthrds_kstep((*lx), (*lx), 1);
175  const dim3 nblcks_kstep((*nelv), 1, 1);
176 
177  char *env_value = NULL;
178  char neko_log_buf[80];
179 
180  env_value=getenv("NEKO_AUTOTUNE");
181 
182  sprintf(neko_log_buf, "Autotune Ax helm (lx: %d)", *lx);
183  log_section(neko_log_buf);
184 
185  if(env_value) {
186  if( !strcmp(env_value,"1D") ) {
187  CASE_1D(LX);
188  sprintf(neko_log_buf,"Set by env : 1 (1D)");
189  log_message(neko_log_buf);
190  log_end_section();
191  return 1;
192  } else if( !strcmp(env_value,"KSTEP") ) {
193  CASE_KSTEP(LX);
194  sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");
195  log_message(neko_log_buf);
196  log_end_section();
197  return 2;
198  } else {
199  sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");
200  log_error(neko_log_buf);
201  }
202  }
203 
204  hipEventCreate(&start);
205  hipEventCreate(&stop);
206 
207  hipEventRecord(start,0);
208 
209  for(int i = 0; i < 100; i++) {
210  CASE_1D(LX);
211  }
212 
213  hipEventRecord(stop,0);
214  hipEventSynchronize(stop);
215  hipEventElapsedTime(&time1, start, stop);
216 
217  hipEventRecord(start,0);
218 
219  for(int i = 0; i < 100; i++) {
220  CASE_KSTEP(LX);
221  }
222 
223  hipEventRecord(stop,0);
224  hipEventSynchronize(stop);
225  hipEventElapsedTime(&time2, start, stop);
226 
227  if(time1 < time2) {
228  retval = 1;
229  } else {
230  retval = 2;
231  }
232 
233  sprintf(neko_log_buf, "Chose : %d (%s)", retval,
234  (retval > 1 ? "KSTEP" : "1D"));
235  log_message(neko_log_buf);
236  log_end_section();
237  return retval;
238 }
239 
240 template < const int LX >
241 int tune_padded(void *w, void *u, void *dx, void *dy, void *dz,
242  void *dxt, void *dyt, void *dzt, void *h1,
243  void *g11, void *g22, void *g33, void *g12,
244  void *g13, void *g23, int *nelv, int *lx) {
245  hipEvent_t start, stop;
246  float time1, time2;
247  int retval;
248 
249  const dim3 nthrds_1d(1024, 1, 1);
250  const dim3 nblcks_1d((*nelv), 1, 1);
251  const dim3 nthrds_kstep((*lx), (*lx), 1);
252  const dim3 nblcks_kstep((*nelv), 1, 1);
253 
254  char *env_value = NULL;
255  char neko_log_buf[80];
256 
257  env_value=getenv("NEKO_AUTOTUNE");
258 
259  sprintf(neko_log_buf, "Autotune Ax helm (lx: %d)", *lx);
260  log_section(neko_log_buf);
261 
262  if(env_value) {
263  if( !strcmp(env_value,"1D") ) {
264  CASE_1D(LX);
265  sprintf(neko_log_buf,"Set by env : 1 (1D)");
266  log_message(neko_log_buf);
267  log_end_section();
268  return 1;
269  } else if( !strcmp(env_value,"KSTEP") ) {
270  CASE_KSTEP(LX);
271  sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");
272  log_message(neko_log_buf);
273  log_end_section();
274  return 2;
275  } else {
276  sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");
277  log_error(neko_log_buf);
278  }
279  }
280 
281  hipEventCreate(&start);
282  hipEventCreate(&stop);
283 
284  hipEventRecord(start,0);
285 
286  for(int i = 0; i < 100; i++) {
287  CASE_1D(LX);
288  }
289 
290  hipEventRecord(stop, 0);
291  hipEventSynchronize(stop);
292  hipEventElapsedTime(&time1, start, stop);
293 
294  hipEventRecord(start, 0);
295 
296  for(int i = 0; i < 100; i++) {
297  CASE_KSTEP_PADDED(LX);
298  }
299 
300  hipEventRecord(stop, 0);
301  hipEventSynchronize(stop);
302  hipEventElapsedTime(&time2, start, stop);
303 
304  if(time1 < time2) {
305  retval=1;
306  } else {
307  retval=2;
308  }
309 
310  sprintf(neko_log_buf, "Chose : %d (%s)", retval,
311  (retval > 1 ? "KSTEP" : "1D"));
312  log_message(neko_log_buf);
313  log_end_section();
314  return retval;
315 }
316 
#define CASE(LX)
#define CASE_KSTEP(LX)
int tune(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.hip:164
#define CASE_PADDED(LX)
#define CASE_1D(LX)
#define CASE_KSTEP_PADDED(LX)
void hip_ax_helm(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.hip:64
int tune_padded(void *w, void *u, void *dx, void *dy, void *dz, void *dxt, void *dyt, void *dzt, void *h1, void *g11, void *g22, void *g33, void *g12, void *g13, void *g23, int *nelv, int *lx)
Definition: ax_helm.hip:241
const int i
Definition: cdtp_kernel.h:132
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
Definition: cdtp_kernel.h:115
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
Definition: cdtp_kernel.h:114
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
Definition: cdtp_kernel.h:113
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
Definition: conv1_kernel.h:138
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
Definition: conv1_kernel.h:136
__global__ void const T *__restrict__ u
Definition: conv1_kernel.h:132
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
Definition: conv1_kernel.h:137
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11
void log_error(char *msg)
void log_message(char *msg)
void log_end_section()
void log_section(char *msg)