// Kernel signature for `scalar_makebdf_kernel`. Only the declaration is
// visible here (no body, no `template` header), so these notes describe the
// parameter list and nothing about the computation itself.
//
// NOTE(review): `T` is not declared on this line -- presumably a template type
// parameter introduced outside this view; confirm.
//
// Parameters, as declared:
//   s_lag, s_laglag, s, B  : `const T *__restrict__` -- pointers to const data;
//                            the kernel cannot write through them.
//   fs                     : `T *__restrict__` -- the only pointer to non-const
//                            data in the list.
//                            NOTE(review): presumably the output array; verify
//                            against the kernel body.
//   rho, dt, bd2, bd3, bd4 : `const T` scalars passed by value.
//   nbd, n                 : `const int` scalars passed by value.
//                            NOTE(review): names suggest `n` is an element count
//                            and `nbd` a scheme order -- TODO confirm.
//
// Every pointer is `__restrict__`-qualified, i.e. the caller promises the
// arrays do not alias one another.
__global__ void scalar_makebdf_kernel(const T *__restrict__ s_lag, const T *__restrict__ s_laglag, T *__restrict__ fs, const T *__restrict__ s, const T *__restrict__ B, const T rho, const T dt, const T bd2, const T bd3, const T bd4, const int nbd, const int n)
// Kernel signature for `makebdf_kernel`. Only the declaration is visible here
// (no body, no `template` header), so these notes describe the parameter list
// and nothing about the computation itself.
//
// NOTE(review): `T` is not declared on this line -- presumably a template type
// parameter introduced outside this view; confirm.
//
// Parameters, as declared:
//   ulag1, ulag2, vlag1, vlag2, wlag1, wlag2, u, v, w, B
//                          : `const T *__restrict__` -- pointers to const data;
//                            the kernel cannot write through them.
//   bfx, bfy, bfz          : `T *__restrict__` -- the only pointers to non-const
//                            data in the list.
//                            NOTE(review): presumably the three output arrays,
//                            one per component; verify against the kernel body.
//   rho, dt, bd2, bd3, bd4 : `const T` scalars passed by value.
//   nbd, n                 : `const int` scalars passed by value.
//                            NOTE(review): names suggest `n` is an element count
//                            and `nbd` a scheme order -- TODO confirm.
//
// Every pointer is `__restrict__`-qualified, i.e. the caller promises the
// arrays do not alias one another.
__global__ void makebdf_kernel(const T *__restrict__ ulag1, const T *__restrict__ ulag2, const T *__restrict__ vlag1, const T *__restrict__ vlag2, const T *__restrict__ wlag1, const T *__restrict__ wlag2, T *__restrict__ bfx, T *__restrict__ bfy, T *__restrict__ bfz, const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w, const T *__restrict__ B, const T rho, const T dt, const T bd2, const T bd3, const T bd4, const int nbd, const int n)