// richardson_compute: kernel entry point (__global__, returns void).
// Only the signature is visible here; the notes below describe what the
// parameter list itself establishes, not what the body does.
//
// NOTE(review): `T` is not declared on this line -- presumably a template
// parameter or type alias supplied elsewhere; confirm.
//
// Every pointer parameter is __restrict__-qualified, so the caller promises
// that none of these buffers alias one another.
//
// Pointers to const T (inputs, never written through these pointers):
//   u_d, v_d, w_d, temp_d, h_d, n_x_d, n_y_d, n_z_d, mu_w_d, rho_w_d
// Pointers to const int (inputs):
//   ind_r_d, ind_s_d, ind_t_d, ind_e_d, h_x_idx, h_y_idx, h_z_idx
// Pointers to non-const T (writable by the kernel):
//   tau_x_d, tau_y_d, tau_z_d,
//   Ri_b_diagn, L_ob_diagn, utau_diagn, magu_diagn, ti_diagn, ts_diagn, q_diagn
// Scalars passed by value:
//   int: n_nodes, lx
//   T:   kappa, g1, g2, g3, Pr, z0, z0h_in, bc_value
//
// NOTE(review): judging by names only, u/v/w look like velocity components,
// n_x/n_y/n_z like normal components, tau_* like stress outputs and *_diagn
// like per-node diagnostics -- confirm against the body and the call site.
// NOTE(review): presumably n_nodes bounds the valid work items; verify that
// the body guards its global index against it.
__global__ void richardson_compute(const T *__restrict__ u_d, const T *__restrict__ v_d, const T *__restrict__ w_d, const T *__restrict__ temp_d, const T *__restrict__ h_d, const T *__restrict__ n_x_d, const T *__restrict__ n_y_d, const T *__restrict__ n_z_d, const int *__restrict__ ind_r_d, const int *__restrict__ ind_s_d, const int *__restrict__ ind_t_d, const int *__restrict__ ind_e_d, T *__restrict__ tau_x_d, T *__restrict__ tau_y_d, T *__restrict__ tau_z_d, int n_nodes, int lx, T kappa, const T *__restrict__ mu_w_d, const T *__restrict__ rho_w_d, T g1, T g2, T g3, T Pr, T z0, T z0h_in, T bc_value, T *__restrict__ Ri_b_diagn, T *__restrict__ L_ob_diagn, T *__restrict__ utau_diagn, T *__restrict__ magu_diagn, T *__restrict__ ti_diagn, T *__restrict__ ts_diagn, T *__restrict__ q_diagn, const int *__restrict__ h_x_idx, const int *__restrict__ h_y_idx, const int *__restrict__ h_z_idx)
// heat_flux_stable: device-side helper (__device__) that returns a single T.
// Only the signature is visible here; the body is not shown.
//
// All eight parameters (ti, ts, Ri_b, h, z0h, utau, kappa, Pr) are T values
// passed by value; no pointers are taken, so the signature gives the function
// no way to write to caller-owned memory.
//
// NOTE(review): `T` is not declared on this line -- presumably a template
// parameter or type alias supplied elsewhere; confirm.
// NOTE(review): the name suggests a heat-flux evaluation for a stable regime,
// with Ri_b a bulk Richardson number and z0h a thermal roughness length --
// these meanings are inferred from names only; confirm against the body.
__device__ T heat_flux_stable(T ti, T ts, T Ri_b, T h, T z0h, T utau, T kappa, T Pr)