/**
 * Kernel entry point `compute_cheap_dist_kernel` (signature only; the body is
 * not visible in this view).
 *
 * NOTE(review): `T` is not declared on this line - presumably a template
 * type parameter introduced by a `template <typename T>` header outside this
 * view; confirm.
 *
 * Parameters, as far as the signature itself establishes them:
 *   d           Non-const `T*`; the only writable `T` buffer in the list, so
 *               presumably the result buffer - TODO confirm against the body.
 *   x, y, z     `const T*` inputs (read-only through these pointers).
 *               Presumably coordinate arrays - verify against caller.
 *   lx, ly, lz  `const int` values passed by value. Presumably per-direction
 *               point counts of an element - TODO confirm.
 *   nel         `const int` passed by value. Presumably the element count -
 *               TODO confirm.
 *   local_iters `const int` passed by value. Presumably an iteration count
 *               used inside the kernel - TODO confirm.
 *   nchange     Non-const `int*`, i.e. writable by the kernel. Presumably a
 *               change counter/flag - TODO confirm how it is updated.
 *
 * All pointer parameters are `__restrict__`-qualified, so callers must not
 * pass buffers that overlap one another.
 *
 * NOTE(review): expected grid/block layout and any shared-memory requirement
 * cannot be determined from the signature; document them once the body is
 * in view.
 */
__global__ void compute_cheap_dist_kernel(T *__restrict__ d, const T *__restrict__ x, const T *__restrict__ y, const T *__restrict__ z, const int lx, const int ly, const int lz, const int nel, const int local_iters, int *__restrict__ nchange)
/**
 * Kernel entry point `ale_add_kinematics_kernel` (signature only; the body is
 * not visible in this view).
 *
 * NOTE(review): `T` is not declared on this line - presumably a template
 * type parameter introduced by a `template <typename T>` header outside this
 * view; confirm. `kinematics_params_t` is likewise declared elsewhere.
 *
 * Parameters, as far as the signature itself establishes them:
 *   n                      `const int` passed by value. Presumably the number
 *                          of entries to process - TODO confirm.
 *   wx, wy, wz             Non-const `T*`; the only writable buffers in the
 *                          list. Presumably three components of a vector
 *                          field updated by the kernel - TODO confirm whether
 *                          they are accumulated into or overwritten.
 *   x_ref, y_ref, z_ref    `const T*` inputs (read-only). Presumably
 *                          reference coordinates - verify against caller.
 *   phi                    `const T*` input (read-only). Meaning not
 *                          determinable from the signature - TODO confirm.
 *   x, y, z                `const T*` inputs (read-only). Presumably current
 *                          coordinates - verify against caller.
 *   kin_params             `const kinematics_params_t`, passed by value as a
 *                          kernel argument. Field layout and meaning are not
 *                          visible here.
 *
 * All pointer parameters are `__restrict__`-qualified, so callers must not
 * pass buffers that overlap one another.
 *
 * NOTE(review): expected grid/block layout and bounds handling against `n`
 * cannot be determined from the signature; document them once the body is
 * in view.
 */
__global__ void ale_add_kinematics_kernel(const int n, T *__restrict__ wx, T *__restrict__ wy, T *__restrict__ wz, const T *__restrict__ x_ref, const T *__restrict__ y_ref, const T *__restrict__ z_ref, const T *__restrict__ phi, const T *__restrict__ x, const T *__restrict__ y, const T *__restrict__ z, const kinematics_params_t kin_params)