1#ifndef __MATH_AX_HELM_KERNEL_H__ 
    2#define __MATH_AX_HELM_KERNEL_H__ 
   41template< 
typename T, const 
int LX, const 
int CHUNKS >
 
  115      for (
int l = 0; l<
LX; l++){
 
  137      for (
int l = 0; l<
LX; l++){
 
 
  148template< 
typename T, const 
int LX >
 
  194  for (
int k = 0; 
k < 
LX; ++
k){
 
  205    for (
int l = 0; l < 
LX; l++){
 
  213    for (
int l = 0; l < 
LX; l++){
 
  234    for (
int l = 0; l < 
LX; l++){
 
  242  for (
int k = 0; 
k < 
LX; ++
k){
 
  252template< 
typename T, const 
int LX >
 
  291  for(
int k = 0; 
k < 
LX; ++
k){
 
  299  for (
int k = 0; 
k < 
LX; ++
k){
 
  310    for (
int l = 0; l < 
LX; l++){
 
  318    for (
int l = 0; l < 
LX; l++){
 
  339    for (
int l = 0; l < 
LX; l++){
 
  347  for (
int k = 0; 
k < 
LX; ++
k){
 
  356template< 
typename T, const 
int LX >
 
  414  for(
int k = 0; 
k < 
LX; ++
k){
 
  428  for (
int k = 0; 
k < 
LX; ++
k){
 
  443    for (
int l = 0; l < 
LX; l++){
 
  459    for (
int l = 0; l < 
LX; l++){
 
  515    for (
int l = 0; l < 
LX; l++){
 
  533  for (
int k = 0; 
k < 
LX; ++
k){
 
  540template< 
typename T, const 
int LX >
 
  599  for(
int k = 0; 
k < 
LX; ++
k){
 
  613  for (
int k = 0; 
k < 
LX; ++
k){
 
  628    for (
int l = 0; l < 
LX; l++){
 
  644    for (
int l = 0; l < 
LX; l++){
 
  700    for (
int l = 0; l < 
LX; l++){
 
  718  for (
int k = 0; 
k < 
LX; ++
k){
 
  725template< 
typename T >
 
  739  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
__shared__ T shdy[LX *LX]
 
__global__ void T *__restrict__ T *__restrict__ aw
 
__shared__ T shdz[LX *LX]
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
 
__shared__ T shvs[LX *LX]
 
__shared__ T shws[LX *LX]
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dx
 
__global__ void T *__restrict__ av
 
__shared__ T shwr[LX *LX]
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
 
__shared__ T shus[LX *LX]
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dz
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ h1
 
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dy
 
__shared__ T shvr[LX *LX]
 
__shared__ T shur[LX *LX]
 
__global__ void ax_helm_kernel_vector_kstep(T *__restrict__ au, T *__restrict__ av, T *__restrict__ aw, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
 
__global__ void ax_helm_kernel_1d(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ dxt, const T *__restrict__ dyt, const T *__restrict__ dzt, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
 
__global__ void ax_helm_kernel_kstep(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
 
__global__ void ax_helm_kernel_kstep_padded(T *__restrict__ w, const T *__restrict__ u, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
 
__global__ void ax_helm_kernel_vector_kstep_padded(T *__restrict__ au, T *__restrict__ av, T *__restrict__ aw, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ dx, const T *__restrict__ dy, const T *__restrict__ dz, const T *__restrict__ h1, const T *__restrict__ g11, const T *__restrict__ g22, const T *__restrict__ g33, const T *__restrict__ g12, const T *__restrict__ g13, const T *__restrict__ g23)
 
__global__ void ax_helm_kernel_vector_part2(T *__restrict__ au, T *__restrict__ av, T *__restrict__ aw, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ h2, const T *__restrict__ B, const int n)
 
__shared__ T shdyt[LX *LX]
 
__shared__ T shdzt[LX *LX]
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dzt
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dyt
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ dxt
 
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g23
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g22
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g13
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g12
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g33
 
__global__ void __launch_bounds__(LX *LX, 3) ax_helm_kernel_kstep(T *__restrict__ w
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ g11