48              void *
dr, 
void *
ds, 
void *
dt,
 
   50              void *
w3, 
int *nel, 
int *lx);
 
   58                 void *
dr, 
void *
ds, 
void *
dt,
 
   60                 void *
w3, 
int *nel, 
int *lx) {
 
   70    cdtp_kernel_1d<real, LX, 1024>                                              \ 
   71      <<<nblcks, nthrds_1d, 0, stream>>>((real *) dtx, (real *) x,              \ 
   72                              (real *) dr, (real *) ds, (real *) dt,            \ 
   73                              (real *) dxt, (real *) dyt, (real *) dzt,         \ 
   75    CUDA_CHECK(cudaGetLastError()); 
   77#define CASE_KSTEP(LX)                                                          \ 
   78    cdtp_kernel_kstep<real, LX>                                                 \ 
   79      <<<nblcks, nthrds_kstep, 0, stream>>>((real *) dtx, (real *) x,           \ 
   80                                 (real *) dr, (real *) ds, (real *) dt,         \ 
   81                                 (real *) dxt, (real *) dyt, (real *) dzt,      \ 
   83    CUDA_CHECK(cudaGetLastError()); 
   87      if(autotune[LX] == 0 ) {                                                  \ 
   88        autotune[LX]=tune_cdtp<LX>(dtx, x,                                      \ 
   92      } else if (autotune[LX] == 1 ) {                                          \ 
   94      } else if (autotune[LX] == 2 ) {                                          \ 
   99#define CASE_LARGE(LX)                                                          \ 
 
  141template < const 
int LX >
 
  143              void *
dr, 
void *
ds, 
void *
dt,
 
  145              void *
w3, 
int *nel, 
int *lx) {
 
  187  for(
int i = 0; 
i < 100; 
i++) {
 
  197  for(
int i = 0; 
i < 100; 
i++) {
 
  212          (
retval > 1 ? 
"KSTEP" : 
"1D"));
 
 
__global__ void const T *__restrict__ const T *__restrict__ dr
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ ds
 
__global__ void const T *__restrict__ x
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dt
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
 
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
 
void log_error(char *msg)
 
void log_message(char *msg)
 
void log_section(char *msg)
 
int tune_cdtp(void *dtx, void *x, void *dr, void *ds, void *dt, void *dxt, void *dyt, void *dzt, void *w3, int *nel, int *lx)
 
void cuda_cdtp(void *dtx, void *x, void *dr, void *ds, void *dt, void *dxt, void *dyt, void *dzt, void *w3, int *nel, int *lx)