Neko  0.9.99
A portable framework for high-order spectral element flow simulations
opr_lambda2.cu
Go to the documentation of this file.
1 /*
2  Copyright (c) 2021-2023, The Neko Authors
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above
13  copyright notice, this list of conditions and the following
14  disclaimer in the documentation and/or other materials provided
15  with the distribution.
16 
17  * Neither the name of the authors nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  POSSIBILITY OF SUCH DAMAGE.
33 */
34 
35 #include <string.h>
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include "lambda2_kernel.h"
39 #include <device/device_config.h>
40 #include <device/cuda/check.h>
41 
42 extern "C" {
43  #include <common/neko_log.h>
44 }
45 
/*
 * Forward declaration of the autotuner defined later in this file.
 * Parameter names mirror the definition and the call in cuda_lambda2():
 * the first argument is the lambda2 output, followed by the three
 * velocity components u, v, w.
 * Returns the selected kernel variant (1 = 1D, 2 = KSTEP).
 */
template < const int LX >
int tune_lambda2(void *lambda2, void *u, void *v, void *w,
                 void *dx, void *dy, void *dz,
                 void *drdx, void *dsdx, void *dtdx,
                 void *drdy, void *dsdy, void *dtdy,
                 void *drdz, void *dsdz, void *dtdz,
                 void *jacinv, int *nel, int *lx);
53 
extern "C" {

  /**
   * C-callable entry point that launches the lambda2 kernel.
   *
   * The kernel is instantiated for the size given by *lx (2..12).
   * On the first call for a given size the variant is chosen by
   * tune_lambda2() and cached; later calls launch the cached variant
   * (1D or KSTEP) directly.
   *
   * All void* arguments are cast to real* and forwarded to the kernel
   * unchanged (presumably device pointers -- confirm against the caller).
   *
   * @param lambda2  output array passed as first kernel argument
   * @param u,v,w    input arrays passed after lambda2
   * @param dx,dy,dz,drdx..dtdz,jacinv  further kernel inputs
   * @param nel      pointer to the number of thread blocks to launch
   * @param lx       pointer to the size used to select the kernel
   *                 instantiation and the KSTEP thread-block shape
   *
   * Terminates the process with exit(1) if *lx is not in 2..12.
   */
  void cuda_lambda2(void *lambda2, void *u, void *v, void *w,
                    void *dx, void *dy, void *dz,
                    void *drdx, void *dsdx, void *dtdx,
                    void *drdy, void *dsdy, void *dtdy,
                    void *drdz, void *dsdz, void *dtdz,
                    void *jacinv, int *nel, int *lx) {

    /* Cached choice per size: 0 = not tuned yet, 1 = 1D, 2 = KSTEP. */
    static int autotune[17] = { 0 };

    /* Launch configuration; these names are used by the macros below. */
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    const dim3 nblcks((*nel), 1, 1);
    const dim3 nthrds_1d(1024, 1, 1);
    const dim3 nthrds_kstep((*lx), (*lx), 1);

/* Launch the 1D variant (1024 threads per block) and check the launch. */
#define CASE_1D(LX)                                                     \
    lambda2_kernel_1d<real, LX, 1024>                                   \
      <<<nblcks, nthrds_1d, 0, stream>>>                                \
      ((real *) lambda2, (real *) u, (real *) v, (real *) w,            \
       (real *) dx, (real *) dy, (real *) dz,                           \
       (real *) drdx, (real *) dsdx, (real *) dtdx,                     \
       (real *) drdy, (real *) dsdy, (real *) dtdy,                     \
       (real *) drdz, (real *) dsdz, (real *) dtdz,                     \
       (real *) jacinv);                                                \
    CUDA_CHECK(cudaGetLastError());

/* Launch the KSTEP variant (lx x lx threads per block) and check it. */
#define CASE_KSTEP(LX)                                                  \
    lambda2_kernel_kstep<real, LX>                                      \
      <<<nblcks, nthrds_kstep, 0, stream>>>                             \
      ((real *) lambda2, (real *) u, (real *) v, (real *) w,            \
       (real *) dx, (real *) dy, (real *) dz,                           \
       (real *) drdx, (real *) dsdx, (real *) dtdx,                     \
       (real *) drdy, (real *) dsdy, (real *) dtdy,                     \
       (real *) drdz, (real *) dsdz, (real *) dtdz,                     \
       (real *) jacinv);                                                \
    CUDA_CHECK(cudaGetLastError());

/* One switch arm per supported size: tune on first use, then dispatch. */
#define CASE(LX)                                                        \
    case LX:                                                            \
      switch (autotune[LX]) {                                           \
      case 0:                                                           \
        autotune[LX] = tune_lambda2<LX>(lambda2, u, v, w,               \
                                        dx, dy, dz,                     \
                                        drdx, dsdx, dtdx,               \
                                        drdy, dsdy, dtdy,               \
                                        drdz, dsdz, dtdz,               \
                                        jacinv, nel, lx);               \
        break;                                                          \
      case 1:                                                           \
        CASE_1D(LX);                                                    \
        break;                                                          \
      case 2:                                                           \
        CASE_KSTEP(LX);                                                 \
        break;                                                          \
      default:                                                          \
        break;                                                          \
      }                                                                 \
      break

    switch (*lx) {
      CASE(2);
      CASE(3);
      CASE(4);
      CASE(5);
      CASE(6);
      CASE(7);
      CASE(8);
      CASE(9);
      CASE(10);
      CASE(11);
      CASE(12);
    default:
      fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
      exit(1);
    }
  }
}
131 
/**
 * Choose between the 1D and KSTEP lambda2 kernels for size LX.
 *
 * If the environment variable NEKO_AUTOTUNE is "1D" or "KSTEP", that
 * variant is launched once and selected without timing. Otherwise both
 * variants are launched repeatedly and timed with CUDA events, and the
 * faster one is selected (KSTEP wins ties).
 *
 * The kernel arguments are forwarded unchanged through the CASE_1D /
 * CASE_KSTEP macros, so the output array is written during tuning.
 *
 * @param lambda2 .. jacinv  opaque pointers forwarded to the kernels
 * @param nel  pointer to the number of thread blocks to launch
 * @param lx   pointer to the size; used for the KSTEP block shape and
 *             for log output (the caller passes *lx == LX)
 * @return 1 if the 1D kernel was selected, 2 if KSTEP was selected
 */
template < const int LX >
int tune_lambda2(void *lambda2, void *u, void *v, void *w,
                 void *dx, void *dy, void *dz,
                 void *drdx, void *dsdx, void *dtdx,
                 void *drdy, void *dsdy, void *dtdy,
                 void *drdz, void *dsdz, void *dtdz,
                 void *jacinv, int *nel, int *lx) {
  // Number of back-to-back launches timed per variant.
  const int n_samples = 100;

  // These four names are referenced by the CASE_1D / CASE_KSTEP macros.
  const dim3 nthrds_1d(1024, 1, 1);
  const dim3 nthrds_kstep((*lx), (*lx), 1);
  const dim3 nblcks((*nel), 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  char neko_log_buf[80];
  const char *env_value = getenv("NEKO_AUTOTUNE");

  snprintf(neko_log_buf, sizeof(neko_log_buf),
           "Autotune lambda2 (lx: %d)", *lx);
  log_section(neko_log_buf);

  // User override: skip the timing run entirely.
  if (env_value) {
    if (!strcmp(env_value, "1D")) {
      CASE_1D(LX);
      snprintf(neko_log_buf, sizeof(neko_log_buf), "Set by env : 1 (1D)");
      log_message(neko_log_buf);
      log_end_section();
      return 1;
    } else if (!strcmp(env_value, "KSTEP")) {
      CASE_KSTEP(LX);
      snprintf(neko_log_buf, sizeof(neko_log_buf), "Set by env : 2 (KSTEP)");
      log_message(neko_log_buf);
      log_end_section();
      return 2;
    } else {
      // NOTE(review): if log_error() returns, we fall through and
      // autotune as if the variable were unset -- confirm this is intended.
      snprintf(neko_log_buf, sizeof(neko_log_buf),
               "Invalid value set for NEKO_AUTOTUNE");
      log_error(neko_log_buf);
    }
  }

  cudaEvent_t start, stop;
  float time1 = 0.0f, time2 = 0.0f;

  CUDA_CHECK(cudaEventCreate(&start));
  CUDA_CHECK(cudaEventCreate(&stop));

  // Record the events on the same stream the kernels are launched on,
  // so the measured interval brackets the kernels regardless of how
  // that stream synchronizes with the default stream.
  CUDA_CHECK(cudaEventRecord(start, stream));
  for (int i = 0; i < n_samples; i++) {
    CASE_1D(LX);
  }
  CUDA_CHECK(cudaEventRecord(stop, stream));
  CUDA_CHECK(cudaEventSynchronize(stop));
  CUDA_CHECK(cudaEventElapsedTime(&time1, start, stop));

  CUDA_CHECK(cudaEventRecord(start, stream));
  for (int i = 0; i < n_samples; i++) {
    CASE_KSTEP(LX);
  }
  CUDA_CHECK(cudaEventRecord(stop, stream));
  CUDA_CHECK(cudaEventSynchronize(stop));
  CUDA_CHECK(cudaEventElapsedTime(&time2, start, stop));

  // Release the timing events; they are not needed after tuning.
  CUDA_CHECK(cudaEventDestroy(start));
  CUDA_CHECK(cudaEventDestroy(stop));

  const int retval = (time1 < time2) ? 1 : 2;

  snprintf(neko_log_buf, sizeof(neko_log_buf), "Chose : %d (%s)", retval,
           (retval > 1 ? "KSTEP" : "1D"));
  log_message(neko_log_buf);
  log_end_section();
  return retval;
}
210 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
const int i
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ drdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dsdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dtdx
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ jacinv
__global__ void T *__restrict__ uy
__global__ void T *__restrict__ T *__restrict__ uz
void * glb_cmd_queue
A simulation component that computes lambda2. The values are stored in the field registry under the na...
Definition: lambda2.f90:37
void log_error(char *msg)
void log_message(char *msg)
void log_end_section()
void log_section(char *msg)
#define CASE(LX)
#define CASE_KSTEP(LX)
#define CASE_1D(LX)
int tune_lambda2(void *ux, void *uy, void *uz, void *u, void *dx, void *dy, void *dz, void *drdx, void *dsdx, void *dtdx, void *drdy, void *dsdy, void *dtdy, void *drdz, void *dsdz, void *dtdz, void *jacinv, int *nel, int *lx)
Definition: opr_lambda2.cu:133
void cuda_lambda2(void *lambda2, void *u, void *v, void *w, void *dx, void *dy, void *dz, void *drdx, void *dsdx, void *dtdx, void *drdy, void *dsdy, void *dtdy, void *drdz, void *dsdz, void *dtdz, void *jacinv, int *nel, int *lx)
Definition: opr_lambda2.cu:59