| 
| subroutine, public  | device_math::device_copy (a_d, b_d, n, strm) | 
|   | Copy a vector \( a = b \).  
  | 
|   | 
| subroutine, public  | device_math::device_masked_copy_0 (a_d, b_d, mask_d, n, n_mask, strm) | 
|   | Copy a masked vector \( a(mask) = b(mask) \).  
  | 
|   | 
| subroutine, public  | device_math::device_masked_gather_copy_0 (a_d, b_d, mask_d, n, n_mask, strm) | 
|   | Gather a masked vector \( a(i) = b(mask(i)) \).  
  | 
|   | 
| subroutine, public  | device_math::device_masked_scatter_copy_0 (a_d, b_d, mask_d, n, n_mask, strm) | 
|   | Scatter a masked vector \( a(mask(i)) = b(i) \).  
  | 
|   | 
| subroutine, public  | device_math::device_masked_atomic_reduction_0 (a_d, b_d, mask_d, n, n_mask, strm) | 
|   | 
| subroutine, public  | device_math::device_cfill_mask (a_d, c, n, mask_d, n_mask, strm) | 
|   | Fill a masked vector with a constant: \( a_i = c \) for \( i \) in mask.  
  | 
|   | 
| subroutine, public  | device_math::device_rzero (a_d, n, strm) | 
|   | Zero a real vector.  
  | 
|   | 
| subroutine, public  | device_math::device_rone (a_d, n, strm) | 
|   | Set all elements to one.  
  | 
|   | 
| subroutine, public  | device_math::device_cmult (a_d, c, n, strm) | 
|   | Multiplication by constant c \( a = c \cdot a \).  
  | 
|   | 
| subroutine, public  | device_math::device_cmult2 (a_d, b_d, c, n, strm) | 
|   | Multiplication by constant c \( a = c \cdot b \).  
  | 
|   | 
| subroutine, public  | device_math::device_cdiv (a_d, c, n, strm) | 
|   | Division of constant c by array \( a = c / a \).  
  | 
|   | 
| subroutine, public  | device_math::device_cdiv2 (a_d, b_d, c, n, strm) | 
|   | Division of constant c by array \( a = c / b \).  
  | 
|   | 
| subroutine  | device_math::device_radd (a_d, c, n, strm) | 
|   | Add a scalar to vector \( a = a + c \).  
  | 
|   | 
| subroutine, public  | device_math::device_cadd2 (a_d, b_d, c, n, strm) | 
|   | Add a scalar to vector \( a = b + c \).  
  | 
|   | 
| subroutine, public  | device_math::device_cfill (a_d, c, n, strm) | 
|   | Set all elements to a constant c \( a = c \).  
  | 
|   | 
| subroutine, public  | device_math::device_add2 (a_d, b_d, n, strm) | 
|   | Vector addition \( a = a + b \).  
  | 
|   | 
| subroutine, public  | device_math::device_add4 (a_d, b_d, c_d, d_d, n, strm) | 
|   | 
| subroutine, public  | device_math::device_add2s1 (a_d, b_d, c1, n, strm) | 
|   | 
| subroutine, public  | device_math::device_add2s2 (a_d, b_d, c1, n, strm) | 
|   | Vector addition with scalar multiplication \( a = a + c_1 b \) (multiplication on second argument)  
  | 
|   | 
| subroutine, public  | device_math::device_addsqr2s2 (a_d, b_d, c1, n, strm) | 
|   | Returns \( a = a + c1 * (b * b )\).  
  | 
|   | 
| subroutine, public  | device_math::device_add3 (a_d, b_d, c_d, n, strm) | 
|   | Vector addition \( a = b + c \).  
  | 
|   | 
| subroutine, public  | device_math::device_add3s2 (a_d, b_d, c_d, c1, c2, n, strm) | 
|   | Returns \( a = c1 * b + c2 * c \).  
  | 
|   | 
| subroutine, public  | device_math::device_add4s3 (a_d, b_d, c_d, d_d, c1, c2, c3, n, strm) | 
|   | Returns \( a = c1 * b + c2 * c + c3 * d\).  
  | 
|   | 
| subroutine, public  | device_math::device_add5s4 (a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm) | 
|   | Returns \( a = a + c1 * b + c2 * c + c3 * d + c4 * e\).  
  | 
|   | 
| subroutine, public  | device_math::device_invcol1 (a_d, n, strm) | 
|   | Invert a vector \( a = 1 / a \).  
  | 
|   | 
| subroutine, public  | device_math::device_invcol2 (a_d, b_d, n, strm) | 
|   | Vector division \( a = a / b \).  
  | 
|   | 
| subroutine, public  | device_math::device_invcol3 (a_d, b_d, c_d, n, strm) | 
|   | Vector division \( a = b / c \).  
  | 
|   | 
| subroutine, public  | device_math::device_col2 (a_d, b_d, n, strm) | 
|   | Vector multiplication \( a = a \cdot b \).  
  | 
|   | 
| subroutine, public  | device_math::device_col3 (a_d, b_d, c_d, n, strm) | 
|   | Vector multiplication with 3 vectors \( a =  b \cdot c \).  
  | 
|   | 
| subroutine, public  | device_math::device_subcol3 (a_d, b_d, c_d, n, strm) | 
|   | Returns \( a = a - b*c \).  
  | 
|   | 
| subroutine, public  | device_math::device_sub2 (a_d, b_d, n, strm) | 
|   | Vector subtraction \( a = a - b \).  
  | 
|   | 
| subroutine, public  | device_math::device_sub3 (a_d, b_d, c_d, n, strm) | 
|   | Vector subtraction \( a = b - c \).  
  | 
|   | 
| subroutine, public  | device_math::device_addcol3 (a_d, b_d, c_d, n, strm) | 
|   | Returns \( a = a + b*c \).  
  | 
|   | 
| subroutine, public  | device_math::device_addcol4 (a_d, b_d, c_d, d_d, n, strm) | 
|   | Returns \( a = a + b*c*d \).  
  | 
|   | 
| subroutine, public  | device_math::device_addcol3s2 (a_d, b_d, c_d, s, n, strm) | 
|   | Returns \( a = a + s(b*c) \).  
  | 
|   | 
| subroutine, public  | device_math::device_vdot3 (dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm) | 
|   | Compute a dot product \( dot = u \cdot v \) (3-d version) assuming vector components \( u = (u_1, u_2, u_3) \) etc.  
  | 
|   | 
| subroutine, public  | device_math::device_vcross (u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, w1_d, w2_d, w3_d, n, strm) | 
|   | Compute a cross product \( u = v \times w \) (3-d version) assuming vector components \( u = (u_1, u_2, u_3) \) etc.  
  | 
|   | 
| real(kind=rp) function, public  | device_math::device_vlsc3 (u_d, v_d, w_d, n, strm) | 
|   | Compute multiplication sum \( dot = u \cdot v \cdot w \).  
  | 
|   | 
| real(kind=rp) function, public  | device_math::device_glsc3 (a_d, b_d, c_d, n, strm) | 
|   | Weighted inner product \( a^T b c \).  
  | 
|   | 
| subroutine, public  | device_math::device_glsc3_many (h, w_d, v_d_d, mult_d, j, n, strm) | 
|   | 
| subroutine, public  | device_math::device_add2s2_many (y_d, x_d_d, a_d, j, n, strm) | 
|   | 
| real(kind=rp) function, public  | device_math::device_glsc2 (a_d, b_d, n, strm) | 
|   | Inner product \( a^T b \).  
  | 
|   | 
| real(kind=rp) function, public  | device_math::device_glsubnorm (a_d, b_d, n, strm) | 
|   | Returns the norm of the difference of two vectors \( \sqrt{(a-b)^T (a-b)} \).  
  | 
|   | 
| real(kind=rp) function, public  | device_math::device_glsum (a_d, n, strm) | 
|   | Sum a vector of length n.  
  | 
|   | 
| subroutine, public  | device_math::device_absval (a_d, n, strm) | 
|   | 
| subroutine, public  | device_math::device_pwmax2 (a_d, b_d, n, strm) | 
|   | Compute the point-wise maximum of two vectors \( a_i = \max(a_i, b_i) \).  
  | 
|   | 
| subroutine, public  | device_math::device_pwmax3 (a_d, b_d, c_d, n, strm) | 
|   | Compute the point-wise maximum of two vectors \( a_i = \max(b_i, c_i) \).  
  | 
|   | 
| subroutine, public  | device_math::device_cpwmax2 (a_d, c, n, strm) | 
|   | Compute the point-wise maximum of a vector and a scalar \( a_i = \max(a_i, c) \).  
  | 
|   | 
| subroutine, public  | device_math::device_cpwmax3 (a_d, b_d, c, n, strm) | 
|   | Compute the point-wise maximum of a vector and a scalar \( a_i = \max(b_i, c) \).  
  | 
|   | 
| subroutine, public  | device_math::device_pwmin2 (a_d, b_d, n, strm) | 
|   | Compute the point-wise minimum of two vectors \( a_i = \min(a_i, b_i) \).  
  | 
|   | 
| subroutine, public  | device_math::device_pwmin3 (a_d, b_d, c_d, n, strm) | 
|   | Compute the point-wise minimum of two vectors \( a_i = \min(b_i, c_i) \).  
  | 
|   | 
| subroutine, public  | device_math::device_cpwmin2 (a_d, c, n, strm) | 
|   | Compute the point-wise minimum of a vector and a scalar \( a_i = \min(a_i, c) \).  
  | 
|   | 
| subroutine, public  | device_math::device_cpwmin3 (a_d, b_d, c, n, strm) | 
|   | Compute the point-wise minimum of a vector and a scalar \( a_i = \min(b_i, c) \).  
  | 
|   | 
| subroutine  | device_math::device_iadd (a_d, c, n, strm) | 
|   | Add an integer scalar to vector \( a = a + c \).  
  | 
|   |