Neko  0.9.0
A portable framework for high-order spectral element flow simulations
opr_conv1.cu
Go to the documentation of this file.
1 /*
2  Copyright (c) 2021-2023, The Neko Authors
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above
13  copyright notice, this list of conditions and the following
14  disclaimer in the documentation and/or other materials provided
15  with the distribution.
16 
17  * Neither the name of the authors nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  POSSIBILITY OF SUCH DAMAGE.
33 */
34 
35 #include <string.h>
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include "conv1_kernel.h"
39 #include <device/device_config.h>
40 #include <device/cuda/check.h>
41 
42 extern "C" {
43  #include <common/neko_log.h>
44 }
45 
46 template < const int >
47 int tune_conv1(void *du, void *u,
48  void *vx, void *vy, void *vz,
49  void *dx, void *dy, void *dz,
50  void *drdx, void *dsdx, void *dtdx,
51  void *drdy, void *dsdy, void *dtdy,
52  void *drdz, void *dsdz, void *dtdz,
53  void *jacinv, int *nel, int *gdim, int *lx);
54 
55 extern "C" {
56 
60  void cuda_conv1(void *du, void *u,
61  void *vx, void *vy, void *vz,
62  void *dx, void *dy, void *dz,
63  void *drdx, void *dsdx, void *dtdx,
64  void *drdy, void *dsdy, void *dtdy,
65  void *drdz, void *dsdz, void *dtdz,
66  void *jacinv, int *nel, int *gdim, int *lx) {
67 
68  static int autotune[17] = { 0 };
69 
70  const dim3 nthrds_1d(1024, 1, 1);
71  const dim3 nthrds_kstep((*lx), (*lx), 1);
72  const dim3 nblcks((*nel), 1, 1);
73  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
74 
75 #define CASE_1D(LX) \
76  conv1_kernel_1d<real, LX, 1024> \
77  <<<nblcks, nthrds_1d, 0, stream>>> \
78  ((real *) du, (real *) u, \
79  (real *) vx, (real *) vy, (real *) vz, \
80  (real *) dx, (real *) dy, (real *) dz, \
81  (real *) drdx, (real *) dsdx, (real *) dtdx, \
82  (real *) drdy, (real *) dsdy, (real *) dtdy, \
83  (real *) drdz, (real *) dsdz, (real *) dtdz, \
84  (real *) jacinv); \
85  CUDA_CHECK(cudaGetLastError());
86 
87 #define CASE_KSTEP(LX) \
88  conv1_kernel_kstep<real, LX> \
89  <<<nblcks, nthrds_kstep, 0, stream>>> \
90  ((real *) du, (real *) u, \
91  (real *) vx, (real *) vy, (real *) vz, \
92  (real *) dx, (real *) dy, (real *) dz, \
93  (real *) drdx, (real *) dsdx, (real *) dtdx, \
94  (real *) drdy, (real *) dsdy, (real *) dtdy, \
95  (real *) drdz, (real *) dsdz, (real *) dtdz, \
96  (real *) jacinv); \
97  CUDA_CHECK(cudaGetLastError());
98 
99 #define CASE(LX) \
100  case LX: \
101  if(autotune[LX] == 0 ) { \
102  autotune[LX]=tune_conv1<LX>(du, u, \
103  vx, vy, vz, \
104  dx, dy, dz, \
105  drdx, dsdx, dtdx, \
106  drdy, dsdy, dtdy, \
107  drdz, dsdz, dtdz, \
108  jacinv, nel, gdim, lx); \
109  } else if (autotune[LX] == 1 ) { \
110  CASE_1D(LX); \
111  } else if (autotune[LX] == 2 ) { \
112  CASE_KSTEP(LX); \
113  } \
114  break
115 
116 #define CASE_LARGE(LX) \
117  case LX: \
118  CASE_KSTEP(LX); \
119  break
120 
121 
122  if ((*lx) < 11) {
123  switch(*lx) {
124  CASE(2);
125  CASE(3);
126  CASE(4);
127  CASE(5);
128  CASE(6);
129  CASE(7);
130  CASE(8);
131  CASE(9);
132  CASE(10);
133  default:
134  {
135  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
136  exit(1);
137  }
138  }
139  }
140  else {
141  switch(*lx) {
142  CASE_LARGE(11);
143  CASE_LARGE(12);
144  CASE_LARGE(13);
145  CASE_LARGE(14);
146  CASE_LARGE(15);
147  CASE_LARGE(16);
148  default:
149  {
150  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
151  exit(1);
152  }
153  }
154  }
155  }
156 }
157 
158 template < const int LX >
159 int tune_conv1(void *du, void *u,
160  void *vx, void *vy, void *vz,
161  void *dx, void *dy, void *dz,
162  void *drdx, void *dsdx, void *dtdx,
163  void *drdy, void *dsdy, void *dtdy,
164  void *drdz, void *dsdz, void *dtdz,
165  void *jacinv, int *nel, int *gdim, int *lx) {
166  cudaEvent_t start,stop;
167  float time1,time2;
168  int retval;
169 
170  const dim3 nthrds_1d(1024, 1, 1);
171  const dim3 nthrds_kstep((*lx), (*lx), 1);
172  const dim3 nblcks((*nel), 1, 1);
173  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
174 
175  char *env_value = NULL;
176  char neko_log_buf[80];
177 
178  env_value=getenv("NEKO_AUTOTUNE");
179 
180  sprintf(neko_log_buf, "Autotune conv1 (lx: %d)", *lx);
181  log_section(neko_log_buf);
182 
183  if(env_value) {
184  if( !strcmp(env_value,"1D") ) {
185  CASE_1D(LX);
186  sprintf(neko_log_buf,"Set by env : 1 (1D)");
187  log_message(neko_log_buf);
188  log_end_section();
189  return 1;
190  } else if( !strcmp(env_value,"KSTEP") ) {
191  CASE_KSTEP(LX);
192  sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");
193  log_message(neko_log_buf);
194  log_end_section();
195  return 2;
196  } else {
197  sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");
198  log_error(neko_log_buf);
199  }
200  }
201 
202  cudaEventCreate(&start);
203  cudaEventCreate(&stop);
204 
205  cudaEventRecord(start,0);
206 
207  for(int i = 0; i < 100; i++) {
208  CASE_1D(LX);
209  }
210 
211  cudaEventRecord(stop,0);
212  cudaEventSynchronize(stop);
213  cudaEventElapsedTime(&time1, start, stop);
214 
215  cudaEventRecord(start,0);
216 
217  for(int i = 0; i < 100; i++) {
218  CASE_KSTEP(LX);
219  }
220 
221  cudaEventRecord(stop,0);
222  cudaEventSynchronize(stop);
223  cudaEventElapsedTime(&time2, start, stop);
224 
225  if(time1 < time2) {
226  retval = 1;
227  } else {
228  retval = 2;
229  }
230 
231  sprintf(neko_log_buf, "Chose : %d (%s)", retval,
232  (retval > 1 ? "KSTEP" : "1D"));
233  log_message(neko_log_buf);
234  log_end_section();
235  return retval;
236 }
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
const int i
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ vz
Definition: conv1_kernel.h:135
__global__ void const T *__restrict__ const T *__restrict__ vx
Definition: conv1_kernel.h:133
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ vy
Definition: conv1_kernel.h:134
void * glb_cmd_queue
void log_error(char *msg)
void log_message(char *msg)
void log_end_section()
void log_section(char *msg)
#define CASE(LX)
void cuda_conv1(void *du, void *u, void *vx, void *vy, void *vz, void *dx, void *dy, void *dz, void *drdx, void *dsdx, void *dtdx, void *drdy, void *dsdy, void *dtdy, void *drdz, void *dsdz, void *dtdz, void *jacinv, int *nel, int *gdim, int *lx)
Definition: opr_conv1.cu:60
#define CASE_KSTEP(LX)
#define CASE_LARGE(LX)
#define CASE_1D(LX)
int tune_conv1(void *du, void *u, void *vx, void *vy, void *vz, void *dx, void *dy, void *dz, void *drdx, void *dsdx, void *dtdx, void *drdy, void *dsdy, void *dtdy, void *drdz, void *dsdz, void *dtdz, void *jacinv, int *nel, int *gdim, int *lx)
Definition: opr_conv1.cu:159