Neko  0.9.99
A portable framework for high-order spectral element flow simulations
opr_cdtp.cu
Go to the documentation of this file.
1 /*
2  Copyright (c) 2021-2023, The Neko Authors
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above
13  copyright notice, this list of conditions and the following
14  disclaimer in the documentation and/or other materials provided
15  with the distribution.
16 
17  * Neither the name of the authors nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  POSSIBILITY OF SUCH DAMAGE.
33 */
34 
35 #include <string.h>
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include "cdtp_kernel.h"
39 #include <device/device_config.h>
40 #include <device/cuda/check.h>
41 
42 extern "C" {
43  #include <common/neko_log.h>
44 }
45 
46 template < const int >
47 int tune_cdtp(void *dtx, void *x,
48  void *dr, void *ds, void *dt,
49  void *dxt, void *dyt, void *dzt,
50  void *w3, int *nel, int *lx);
51 
52 extern "C" {
53 
57  void cuda_cdtp(void *dtx, void *x,
58  void *dr, void *ds, void *dt,
59  void *dxt, void *dyt, void *dzt,
60  void *w3, int *nel, int *lx) {
61 
62  static int autotune[17] = { 0 };
63 
64  const dim3 nthrds_1d(1024, 1, 1);
65  const dim3 nthrds_kstep((*lx), (*lx), 1);
66  const dim3 nblcks((*nel), 1, 1);
67  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
68 
69 #define CASE_1D(LX) \
70  cdtp_kernel_1d<real, LX, 1024> \
71  <<<nblcks, nthrds_1d, 0, stream>>>((real *) dtx, (real *) x, \
72  (real *) dr, (real *) ds, (real *) dt, \
73  (real *) dxt, (real *) dyt, (real *) dzt, \
74  (real *) w3); \
75  CUDA_CHECK(cudaGetLastError());
76 
77 #define CASE_KSTEP(LX) \
78  cdtp_kernel_kstep<real, LX> \
79  <<<nblcks, nthrds_kstep, 0, stream>>>((real *) dtx, (real *) x, \
80  (real *) dr, (real *) ds, (real *) dt, \
81  (real *) dxt, (real *) dyt, (real *) dzt, \
82  (real *) w3); \
83  CUDA_CHECK(cudaGetLastError());
84 
85 #define CASE(LX) \
86  case LX: \
87  if(autotune[LX] == 0 ) { \
88  autotune[LX]=tune_cdtp<LX>(dtx, x, \
89  dr, ds, dt, \
90  dxt, dyt, dzt, \
91  w3, nel, lx); \
92  } else if (autotune[LX] == 1 ) { \
93  CASE_1D(LX); \
94  } else if (autotune[LX] == 2 ) { \
95  CASE_KSTEP(LX); \
96  } \
97  break
98 
99 #define CASE_LARGE(LX) \
100  case LX: \
101  CASE_KSTEP(LX); \
102  break
103 
104 
105  if ((*lx) < 13) {
106  switch(*lx) {
107  CASE(2);
108  CASE(3);
109  CASE(4);
110  CASE(5);
111  CASE(6);
112  CASE(7);
113  CASE(8);
114  CASE(9);
115  CASE(10);
116  CASE(11);
117  CASE(12);
118  default:
119  {
120  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
121  exit(1);
122  }
123  }
124  }
125  else {
126  switch(*lx) {
127  CASE_LARGE(13);
128  CASE_LARGE(14);
129  CASE_LARGE(15);
130  CASE_LARGE(16);
131  default:
132  {
133  fprintf(stderr, __FILE__ ": size not supported: %d\n", *lx);
134  exit(1);
135  }
136  }
137  }
138  }
139 }
140 
141 template < const int LX >
142 int tune_cdtp(void *dtx, void *x,
143  void *dr, void *ds, void *dt,
144  void *dxt, void *dyt, void *dzt,
145  void *w3, int *nel, int *lx) {
146  cudaEvent_t start,stop;
147  float time1,time2;
148  int retval;
149 
150  const dim3 nthrds_1d(1024, 1, 1);
151  const dim3 nthrds_kstep((*lx), (*lx), 1);
152  const dim3 nblcks((*nel), 1, 1);
153  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
154 
155  char *env_value = NULL;
156  char neko_log_buf[80];
157 
158  env_value=getenv("NEKO_AUTOTUNE");
159 
160  sprintf(neko_log_buf, "Autotune cdtp (lx: %d)", *lx);
161  log_section(neko_log_buf);
162 
163  if(env_value) {
164  if( !strcmp(env_value,"1D") ) {
165  CASE_1D(LX);
166  sprintf(neko_log_buf,"Set by env : 1 (1D)");
167  log_message(neko_log_buf);
168  log_end_section();
169  return 1;
170  } else if( !strcmp(env_value,"KSTEP") ) {
171  CASE_KSTEP(LX);
172  sprintf(neko_log_buf,"Set by env : 2 (KSTEP)");
173  log_message(neko_log_buf);
174  log_end_section();
175  return 2;
176  } else {
177  sprintf(neko_log_buf, "Invalid value set for NEKO_AUTOTUNE");
178  log_error(neko_log_buf);
179  }
180  }
181 
182  cudaEventCreate(&start);
183  cudaEventCreate(&stop);
184 
185  cudaEventRecord(start,0);
186 
187  for(int i = 0; i < 100; i++) {
188  CASE_1D(LX);
189  }
190 
191  cudaEventRecord(stop,0);
192  cudaEventSynchronize(stop);
193  cudaEventElapsedTime(&time1, start, stop);
194 
195  cudaEventRecord(start,0);
196 
197  for(int i = 0; i < 100; i++) {
198  CASE_KSTEP(LX);
199  }
200 
201  cudaEventRecord(stop,0);
202  cudaEventSynchronize(stop);
203  cudaEventElapsedTime(&time2, start, stop);
204 
205  if(time1 < time2) {
206  retval = 1;
207  } else {
208  retval = 2;
209  }
210 
211  sprintf(neko_log_buf, "Chose : %d (%s)", retval,
212  (retval > 1 ? "KSTEP" : "1D"));
213  log_message(neko_log_buf);
214  log_end_section();
215  return retval;
216 }
const int i
__global__ void const T *__restrict__ const T *__restrict__ dr
Definition: cdtp_kernel.h:107
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
Definition: cdtp_kernel.h:108
__global__ void const T *__restrict__ x
Definition: cdtp_kernel.h:106
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
Definition: cdtp_kernel.h:109
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
Definition: cdtp_kernel.h:113
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
Definition: cdtp_kernel.h:112
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
Definition: cdtp_kernel.h:111
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
Definition: cdtp_kernel.h:110
void * glb_cmd_queue
void log_error(char *msg)
void log_message(char *msg)
void log_end_section()
void log_section(char *msg)
int tune_cdtp(void *dtx, void *x, void *dr, void *ds, void *dt, void *dxt, void *dyt, void *dzt, void *w3, int *nel, int *lx)
Definition: opr_cdtp.cu:142
#define CASE(LX)
#define CASE_KSTEP(LX)
#define CASE_LARGE(LX)
#define CASE_1D(LX)
void cuda_cdtp(void *dtx, void *x, void *dr, void *ds, void *dt, void *dxt, void *dyt, void *dzt, void *w3, int *nel, int *lx)
Definition: opr_cdtp.cu:57