d8/d8f/device__math_8F90_source.html

! Copyright (c) 2021-2024, The Neko Authors

! All rights reserved.

!

! Redistribution and use in source and binary forms, with or without

! modification, are permitted provided that the following conditions

! are met:

!

!   * Redistributions of source code must retain the above copyright

!     notice, this list of conditions and the following disclaimer.

!

!   * Redistributions in binary form must reproduce the above

!     copyright notice, this list of conditions and the following

!     disclaimer in the documentation and/or other materials provided

!     with the distribution.

!

!   * Neither the name of the authors nor the names of its

!     contributors may be used to endorse or promote products derived

!     from this software without specific prior written permission.

!

! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

! POSSIBILITY OF SUCH DAMAGE.

!

module device_math

  use, intrinsic :: iso_c_binding, only: c_ptr, c_int

  use num_types, only: rp, c_rp

  use utils, only: neko_error

  use comm, only: neko_comm, pe_size, mpi_real_precision

  use mpi_f08, only: mpi_sum, mpi_in_place, mpi_allreduce


  ! ========================================================================== !

  ! Device math interfaces


  use hip_math

  use cuda_math

  use opencl_math


  implicit none

  private


  !> Generic point-wise maximum.
  !! Resolves, by argument list, to one of the module-private specifics:
  !! the vector/vector forms (`_vec2`, `_vec3`) or the vector/scalar forms
  !! (`_sca2`, `_sca3`).
  interface device_pwmax
     module procedure device_pwmax_vec2, device_pwmax_vec3, &
          device_pwmax_sca2, device_pwmax_sca3
  end interface device_pwmax


  !> Generic point-wise minimum.
  !! Resolves, by argument list, to one of the module-private specifics:
  !! the vector/vector forms (`_vec2`, `_vec3`) or the vector/scalar forms
  !! (`_sca2`, `_sca3`).
  interface device_pwmin
     module procedure device_pwmin_vec2, device_pwmin_vec3, &
          device_pwmin_sca2, device_pwmin_sca3
  end interface device_pwmin


  ! Public API of the module. The default accessibility is private, so the
  ! specific device_pwmax_* / device_pwmin_* procedures are reachable only
  ! through the generic names device_pwmax and device_pwmin listed here.
  public :: device_copy, device_rzero, device_rone, device_cmult, &

       device_cmult2, device_cadd, device_cadd2, device_cfill, device_add2, &

       device_add3, device_add4, device_add2s1, device_add2s2, &

       device_addsqr2s2, device_add3s2, device_invcol1, device_invcol2, &

       device_col2, device_col3, device_subcol3, device_sub2, device_sub3, &

       device_addcol3, device_addcol4, device_vdot3, device_vlsc3, &

       device_glsc3, device_glsc3_many, device_add2s2_many, device_glsc2, &

       device_glsum, device_masked_copy, device_cfill_mask, &

       device_masked_red_copy, device_vcross, device_absval, &

       device_pwmax, device_pwmin, device_masked_atomic_reduction


contains


  !> Backend dispatch for the device copy kernel.
  !! Forwards all arguments unchanged to `hip_copy`, `cuda_copy` or
  !! `opencl_copy`, whichever backend macro is set at compile time, and
  !! calls `neko_error` when none is.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  !! @note Which of `a_d` / `b_d` is the destination is decided by the
  !! backend routine and is not visible in this module.
  subroutine device_copy(a_d, b_d, n)
    type(c_ptr) :: a_d, b_d
    integer :: n
#if HAVE_HIP
    call hip_copy(a_d, b_d, n)
#elif HAVE_CUDA
    call cuda_copy(a_d, b_d, n)
#elif HAVE_OPENCL
    call opencl_copy(a_d, b_d, n)
#else
    call neko_error('no device backend configured')
#endif
  end subroutine device_copy


  !> Backend dispatch for the masked copy kernel.
  !! Forwards all arguments unchanged to `hip_masked_copy`,
  !! `cuda_masked_copy` or `opencl_masked_copy`; calls `neko_error` when no
  !! backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param mask_d C pointer to the mask handed to the backend routine.
  !! @param n Integer size argument (presumably the vector length).
  !! @param m Integer size argument (presumably the mask length).
  subroutine device_masked_copy(a_d, b_d, mask_d, n, m)
    type(c_ptr) :: a_d, b_d, mask_d
    integer :: n, m
#if HAVE_HIP
    call hip_masked_copy(a_d, b_d, mask_d, n, m)
#elif HAVE_CUDA
    call cuda_masked_copy(a_d, b_d, mask_d, n, m)
#elif HAVE_OPENCL
    call opencl_masked_copy(a_d, b_d, mask_d, n, m)
#else
    call neko_error('no device backend configured')
#endif
  end subroutine device_masked_copy


  !> Backend dispatch for the masked "red" copy kernel.
  !! Forwards all arguments unchanged to `hip_masked_red_copy` or
  !! `cuda_masked_red_copy`. There is no OpenCL implementation: an OpenCL
  !! build, like a build without any backend, ends in `neko_error`.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param mask_d C pointer to the mask handed to the backend routine.
  !! @param n Integer size argument (presumably the vector length).
  !! @param m Integer size argument (presumably the mask length).
  subroutine device_masked_red_copy(a_d, b_d, mask_d, n, m)
    type(c_ptr) :: a_d, b_d, mask_d
    integer :: n, m
#if HAVE_HIP
    call hip_masked_red_copy(a_d, b_d, mask_d, n, m)
#elif HAVE_CUDA
    call cuda_masked_red_copy(a_d, b_d, mask_d, n, m)
#elif HAVE_OPENCL
    call neko_error('No OpenCL bcknd, masked red copy')
#else
    call neko_error('no device backend configured')
#endif
  end subroutine device_masked_red_copy


  !> Backend dispatch for the masked atomic reduction kernel.
  !! Forwards all arguments unchanged to `hip_masked_atomic_reduction` or
  !! `cuda_masked_atomic_reduction`. There is no OpenCL implementation: an
  !! OpenCL build, like a build without any backend, ends in `neko_error`.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param mask_d C pointer to the mask handed to the backend routine.
  !! @param n Integer size argument (presumably the vector length).
  !! @param m Integer size argument (presumably the mask length).
  subroutine device_masked_atomic_reduction(a_d, b_d, mask_d, n, m)
    type(c_ptr) :: a_d, b_d, mask_d
    integer :: n, m
#if HAVE_HIP
    call hip_masked_atomic_reduction(a_d, b_d, mask_d, n, m)
#elif HAVE_CUDA
    call cuda_masked_atomic_reduction(a_d, b_d, mask_d, n, m)
#elif HAVE_OPENCL
    call neko_error('No OpenCL bcknd, masked atomic reduction')
#else
    call neko_error('no device backend configured')
#endif
  end subroutine device_masked_atomic_reduction


  !> Backend dispatch for the masked constant-fill kernel.
  !! Forwards all arguments unchanged to `hip_cfill_mask`,
  !! `cuda_cfill_mask` or `opencl_cfill_mask`; calls `neko_error` when no
  !! backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param c Scalar of working precision handed to the backend routine.
  !! @param size Integer size argument (presumably the length of `a_d`).
  !! @param mask_d C pointer to the mask handed to the backend routine.
  !! @param mask_size Integer size argument (presumably the mask length).
  subroutine device_cfill_mask(a_d, c, size, mask_d, mask_size)
    type(c_ptr) :: a_d
    real(kind=rp), intent(in) :: c
    integer :: size
    type(c_ptr) :: mask_d
    integer :: mask_size
#if HAVE_HIP
    call hip_cfill_mask(a_d, c, size, mask_d, mask_size)
#elif HAVE_CUDA
    call cuda_cfill_mask(a_d, c, size, mask_d, mask_size)
#elif HAVE_OPENCL
    call opencl_cfill_mask(a_d, c, size, mask_d, mask_size)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_cfill_mask


  !> Zero a real vector (backend dispatch).
  !! Forwards the arguments unchanged to `hip_rzero`, `cuda_rzero` or
  !! `opencl_rzero`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_rzero(a_d, n)
    type(c_ptr) :: a_d
    integer :: n
#if HAVE_HIP
    call hip_rzero(a_d, n)
#elif HAVE_CUDA
    call cuda_rzero(a_d, n)
#elif HAVE_OPENCL
    call opencl_rzero(a_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_rzero


  !> Set all elements to one.
  !! Implemented as `device_cfill` with the constant `1.0_rp` whenever any
  !! of the HIP, CUDA or OpenCL macros is set; otherwise calls `neko_error`.
  !! @param a_d C pointer forwarded to `device_cfill`.
  !! @param n Integer size argument forwarded to `device_cfill`.
  subroutine device_rone(a_d, n)
    type(c_ptr) :: a_d
    integer :: n
    real(kind=rp), parameter :: one = 1.0_rp
#if HAVE_HIP || HAVE_CUDA || HAVE_OPENCL
    call device_cfill(a_d, one, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_rone


  !> Multiplication by constant c (backend dispatch).
  !! Forwards the arguments unchanged to `hip_cmult`, `cuda_cmult` or
  !! `opencl_cmult`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param c Scalar constant of working precision.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_cmult(a_d, c, n)
    type(c_ptr) :: a_d
    real(kind=rp), intent(in) :: c
    integer :: n
#if HAVE_HIP
    call hip_cmult(a_d, c, n)
#elif HAVE_CUDA
    call cuda_cmult(a_d, c, n)
#elif HAVE_OPENCL
    call opencl_cmult(a_d, c, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_cmult


  !> Multiplication by constant c, two-vector form (backend dispatch).
  !! Forwards the arguments unchanged to `hip_cmult2`, `cuda_cmult2` or
  !! `opencl_cmult2`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c Scalar constant of working precision.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_cmult2(a_d, b_d, c, n)
    type(c_ptr) :: a_d, b_d
    real(kind=rp), intent(in) :: c
    integer :: n
#if HAVE_HIP
    call hip_cmult2(a_d, b_d, c, n)
#elif HAVE_CUDA
    call cuda_cmult2(a_d, b_d, c, n)
#elif HAVE_OPENCL
    call opencl_cmult2(a_d, b_d, c, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_cmult2


  !> Add a scalar to a vector (backend dispatch).
  !! Forwards the arguments unchanged to `hip_cadd`, `cuda_cadd` or
  !! `opencl_cadd`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param c Scalar of working precision.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_cadd(a_d, c, n)
    type(c_ptr) :: a_d
    real(kind=rp), intent(in) :: c
    integer :: n
#if HAVE_HIP
    call hip_cadd(a_d, c, n)
#elif HAVE_CUDA
    call cuda_cadd(a_d, c, n)
#elif HAVE_OPENCL
    call opencl_cadd(a_d, c, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_cadd


  !> Backend dispatch for the two-vector scalar-add kernel.
  !! Forwards the arguments unchanged to `hip_cadd2`, `cuda_cadd2` or
  !! `opencl_cadd2`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c Scalar of working precision.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_cadd2(a_d, b_d, c, n)
    type(c_ptr) :: a_d
    type(c_ptr) :: b_d
    real(kind=rp), intent(in) :: c
    integer :: n
#if HAVE_HIP
    call hip_cadd2(a_d, b_d, c, n)
#elif HAVE_CUDA
    call cuda_cadd2(a_d, b_d, c, n)
#elif HAVE_OPENCL
    call opencl_cadd2(a_d, b_d, c, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_cadd2


  !> Backend dispatch for the constant-fill kernel.
  !! Forwards the arguments unchanged to `hip_cfill`, `cuda_cfill` or
  !! `opencl_cfill`; calls `neko_error` when no backend macro is set.
  !! `device_rone` relies on this routine to fill with 1.0.
  !! @param a_d C pointer handed to the backend routine.
  !! @param c Scalar of working precision handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_cfill(a_d, c, n)
    type(c_ptr) :: a_d
    real(kind=rp), intent(in) :: c
    integer :: n
#if HAVE_HIP
    call hip_cfill(a_d, c, n)
#elif HAVE_CUDA
    call cuda_cfill(a_d, c, n)
#elif HAVE_OPENCL
    call opencl_cfill(a_d, c, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_cfill


  !> Vector addition (backend dispatch).
  !! Forwards the arguments unchanged to `hip_add2`, `cuda_add2` or
  !! `opencl_add2`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_add2(a_d, b_d, n)
    type(c_ptr) :: a_d, b_d
    integer :: n
#if HAVE_HIP
    call hip_add2(a_d, b_d, n)
#elif HAVE_CUDA
    call cuda_add2(a_d, b_d, n)
#elif HAVE_OPENCL
    call opencl_add2(a_d, b_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_add2


  !> Backend dispatch for the four-vector addition kernel.
  !! Forwards the arguments unchanged to `hip_add4`, `cuda_add4` or
  !! `opencl_add4`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c_d C pointer handed to the backend routine.
  !! @param d_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_add4(a_d, b_d, c_d, d_d, n)
    type(c_ptr) :: a_d, b_d, c_d, d_d
    integer :: n
#if HAVE_HIP
    call hip_add4(a_d, b_d, c_d, d_d, n)
#elif HAVE_CUDA
    call cuda_add4(a_d, b_d, c_d, d_d, n)
#elif HAVE_OPENCL
    call opencl_add4(a_d, b_d, c_d, d_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_add4


  !> Backend dispatch for the `add2s1` kernel (vector addition involving
  !! the scalar `c1`).
  !! Forwards the arguments unchanged to `hip_add2s1`, `cuda_add2s1` or
  !! `opencl_add2s1`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c1 Scalar of working precision handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  !! @note Which operand is scaled by `c1` is defined by the backend
  !! kernel and is not visible in this module.
  subroutine device_add2s1(a_d, b_d, c1, n)
    type(c_ptr) :: a_d, b_d
    real(kind=rp) :: c1
    integer :: n
#if HAVE_HIP
    call hip_add2s1(a_d, b_d, c1, n)
#elif HAVE_CUDA
    call cuda_add2s1(a_d, b_d, c1, n)
#elif HAVE_OPENCL
    call opencl_add2s1(a_d, b_d, c1, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_add2s1


  !> Vector addition with scalar multiplication (backend dispatch).
  !! Forwards the arguments unchanged to `hip_add2s2`, `cuda_add2s2` or
  !! `opencl_add2s2`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c1 Scalar of working precision handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  !! @note Which operand is scaled by `c1` is defined by the backend
  !! kernel and is not visible in this module.
  subroutine device_add2s2(a_d, b_d, c1, n)
    type(c_ptr) :: a_d, b_d
    real(kind=rp) :: c1
    integer :: n
#if HAVE_HIP
    call hip_add2s2(a_d, b_d, c1, n)
#elif HAVE_CUDA
    call cuda_add2s2(a_d, b_d, c1, n)
#elif HAVE_OPENCL
    call opencl_add2s2(a_d, b_d, c1, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_add2s2


  !> Backend dispatch for the `addsqr2s2` kernel.
  !! Forwards the arguments unchanged to `hip_addsqr2s2`,
  !! `cuda_addsqr2s2` or `opencl_addsqr2s2`; calls `neko_error` when no
  !! backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c1 Scalar of working precision handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_addsqr2s2(a_d, b_d, c1, n)
    type(c_ptr) :: a_d, b_d
    real(kind=rp) :: c1
    integer :: n
#if HAVE_HIP
    call hip_addsqr2s2(a_d, b_d, c1, n)
#elif HAVE_CUDA
    call cuda_addsqr2s2(a_d, b_d, c1, n)
#elif HAVE_OPENCL
    call opencl_addsqr2s2(a_d, b_d, c1, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_addsqr2s2


  !> Backend dispatch for the three-vector addition kernel.
  !! Forwards the arguments unchanged to `hip_add3`, `cuda_add3` or
  !! `opencl_add3`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_add3(a_d, b_d, c_d, n)
    type(c_ptr) :: a_d, b_d, c_d
    integer :: n
#if HAVE_HIP
    call hip_add3(a_d, b_d, c_d, n)
#elif HAVE_CUDA
    call cuda_add3(a_d, b_d, c_d, n)
#elif HAVE_OPENCL
    call opencl_add3(a_d, b_d, c_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_add3


  !> Backend dispatch for the `add3s2` kernel (three vectors, two scalars).
  !! Forwards the arguments unchanged to `hip_add3s2`, `cuda_add3s2` or
  !! `opencl_add3s2`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c_d C pointer handed to the backend routine.
  !! @param c1 First scalar of working precision.
  !! @param c2 Second scalar of working precision.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_add3s2(a_d, b_d, c_d, c1, c2, n)
    type(c_ptr) :: a_d, b_d, c_d
    real(kind=rp) :: c1, c2
    integer :: n
#if HAVE_HIP
    call hip_add3s2(a_d, b_d, c_d, c1, c2, n)
#elif HAVE_CUDA
    call cuda_add3s2(a_d, b_d, c_d, c1, c2, n)
#elif HAVE_OPENCL
    call opencl_add3s2(a_d, b_d, c_d, c1, c2, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_add3s2


  !> Invert a vector (backend dispatch).
  !! Forwards the arguments unchanged to `hip_invcol1`, `cuda_invcol1` or
  !! `opencl_invcol1`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_invcol1(a_d, n)
    type(c_ptr) :: a_d
    integer :: n
#if HAVE_HIP
    call hip_invcol1(a_d, n)
#elif HAVE_CUDA
    call cuda_invcol1(a_d, n)
#elif HAVE_OPENCL
    call opencl_invcol1(a_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_invcol1


  !> Backend dispatch for the `invcol2` kernel.
  !! Forwards the arguments unchanged to `hip_invcol2`, `cuda_invcol2` or
  !! `opencl_invcol2`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_invcol2(a_d, b_d, n)
    type(c_ptr) :: a_d, b_d
    integer :: n
#if HAVE_HIP
    call hip_invcol2(a_d, b_d, n)
#elif HAVE_CUDA
    call cuda_invcol2(a_d, b_d, n)
#elif HAVE_OPENCL
    call opencl_invcol2(a_d, b_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_invcol2


  !> Vector multiplication (backend dispatch).
  !! Forwards the arguments unchanged to `hip_col2`, `cuda_col2` or
  !! `opencl_col2`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_col2(a_d, b_d, n)
    type(c_ptr) :: a_d, b_d
    integer :: n
#if HAVE_HIP
    call hip_col2(a_d, b_d, n)
#elif HAVE_CUDA
    call cuda_col2(a_d, b_d, n)
#elif HAVE_OPENCL
    call opencl_col2(a_d, b_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_col2


  !> Vector multiplication with 3 vectors (backend dispatch).
  !! Forwards the arguments unchanged to `hip_col3`, `cuda_col3` or
  !! `opencl_col3`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_col3(a_d, b_d, c_d, n)
    type(c_ptr) :: a_d, b_d, c_d
    integer :: n
#if HAVE_HIP
    call hip_col3(a_d, b_d, c_d, n)
#elif HAVE_CUDA
    call cuda_col3(a_d, b_d, c_d, n)
#elif HAVE_OPENCL
    call opencl_col3(a_d, b_d, c_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_col3


  !> Backend dispatch for the `subcol3` kernel.
  !! Forwards the arguments unchanged to `hip_subcol3`, `cuda_subcol3` or
  !! `opencl_subcol3`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_subcol3(a_d, b_d, c_d, n)
    type(c_ptr) :: a_d, b_d, c_d
    integer :: n
#if HAVE_HIP
    call hip_subcol3(a_d, b_d, c_d, n)
#elif HAVE_CUDA
    call cuda_subcol3(a_d, b_d, c_d, n)
#elif HAVE_OPENCL
    call opencl_subcol3(a_d, b_d, c_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_subcol3


  !> Backend dispatch for the two-vector subtraction kernel.
  !! Forwards the arguments unchanged to `hip_sub2`, `cuda_sub2` or
  !! `opencl_sub2`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_sub2(a_d, b_d, n)
    type(c_ptr) :: a_d, b_d
    integer :: n
#if HAVE_HIP
    call hip_sub2(a_d, b_d, n)
#elif HAVE_CUDA
    call cuda_sub2(a_d, b_d, n)
#elif HAVE_OPENCL
    call opencl_sub2(a_d, b_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_sub2


  !> Backend dispatch for the three-vector subtraction kernel.
  !! Forwards the arguments unchanged to `hip_sub3`, `cuda_sub3` or
  !! `opencl_sub3`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_sub3(a_d, b_d, c_d, n)
    type(c_ptr) :: a_d, b_d, c_d
    integer :: n
#if HAVE_HIP
    call hip_sub3(a_d, b_d, c_d, n)
#elif HAVE_CUDA
    call cuda_sub3(a_d, b_d, c_d, n)
#elif HAVE_OPENCL
    call opencl_sub3(a_d, b_d, c_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_sub3


  !> Backend dispatch for the `addcol3` kernel.
  !! Forwards the arguments unchanged to `hip_addcol3`, `cuda_addcol3` or
  !! `opencl_addcol3`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_addcol3(a_d, b_d, c_d, n)
    type(c_ptr) :: a_d, b_d, c_d
    integer :: n
#if HAVE_HIP
    call hip_addcol3(a_d, b_d, c_d, n)
#elif HAVE_CUDA
    call cuda_addcol3(a_d, b_d, c_d, n)
#elif HAVE_OPENCL
    call opencl_addcol3(a_d, b_d, c_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_addcol3


  !> Backend dispatch for the `addcol4` kernel.
  !! Forwards the arguments unchanged to `hip_addcol4`, `cuda_addcol4` or
  !! `opencl_addcol4`; calls `neko_error` when no backend macro is set.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c_d C pointer handed to the backend routine.
  !! @param d_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_addcol4(a_d, b_d, c_d, d_d, n)
    type(c_ptr) :: a_d, b_d, c_d, d_d
    integer :: n
#if HAVE_HIP
    call hip_addcol4(a_d, b_d, c_d, d_d, n)
#elif HAVE_CUDA
    call cuda_addcol4(a_d, b_d, c_d, d_d, n)
#elif HAVE_OPENCL
    call opencl_addcol4(a_d, b_d, c_d, d_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_addcol4


  !> Compute a dot product (3-d version) of two vectors given by their
  !! components (backend dispatch).
  !! Forwards the arguments unchanged to `hip_vdot3`, `cuda_vdot3` or
  !! `opencl_vdot3`; calls `neko_error` when no backend macro is set.
  !! @param dot_d C pointer handed to the backend (presumably the result).
  !! @param u1_d,u2_d,u3_d C pointers to the components of the first vector.
  !! @param v1_d,v2_d,v3_d C pointers to the components of the second vector.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n)
    type(c_ptr) :: dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d
    integer :: n
#if HAVE_HIP
    call hip_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n)
#elif HAVE_CUDA
    call cuda_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n)
#elif HAVE_OPENCL
    call opencl_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_vdot3


  !> Compute a cross product (3-d version) of vectors given by their
  !! components (backend dispatch).
  !! Forwards the arguments unchanged to `hip_vcross` or `cuda_vcross`.
  !! There is no OpenCL implementation: an OpenCL build, like a build
  !! without any backend, ends in `neko_error`.
  !! @param u1_d,u2_d,u3_d C pointers handed to the backend routine.
  !! @param v1_d,v2_d,v3_d C pointers handed to the backend routine.
  !! @param w1_d,w2_d,w3_d C pointers handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  !! @note Which triple receives the result is defined by the backend
  !! kernel and is not visible in this module.
  subroutine device_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
       w1_d, w2_d, w3_d, n)
    type(c_ptr) :: u1_d, u2_d, u3_d
    type(c_ptr) :: v1_d, v2_d, v3_d
    type(c_ptr) :: w1_d, w2_d, w3_d
    integer :: n
#if HAVE_HIP
    call hip_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
         w1_d, w2_d, w3_d, n)
#elif HAVE_CUDA
    call cuda_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
         w1_d, w2_d, w3_d, n)
#elif HAVE_OPENCL
    ! Message spelling corrected ("backedn" -> "backend").
    call neko_error("no opencl backend vcross")
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_vcross


  !> Compute multiplication sum of three vectors (backend dispatch).
  !! Calls `hip_vlsc3` or `cuda_vlsc3`; an OpenCL build reuses
  !! `opencl_glsc3`. No MPI reduction is performed in this routine.
  !! Without any backend macro `neko_error` is called.
  !! @param u_d C pointer handed to the backend routine.
  !! @param v_d C pointer handed to the backend routine.
  !! @param w_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  !! @return Value returned by the backend; initialised to zero beforehand.
  function device_vlsc3(u_d, v_d, w_d, n) result(res)
    type(c_ptr) :: u_d, v_d, w_d
    integer :: n
    real(kind=rp) :: res
    res = 0.0_rp
#if HAVE_HIP
    res = hip_vlsc3(u_d, v_d, w_d, n)
#elif HAVE_CUDA
    res = cuda_vlsc3(u_d, v_d, w_d, n)
#elif HAVE_OPENCL
    ! Same kernel as glsc3 (currently no device MPI for OpenCL)
    res = opencl_glsc3(u_d, v_d, w_d, n)
#else
    call neko_error('No device backend configured')
#endif
  end function device_vlsc3


  !> Three-vector scalar reduction, summed over all MPI ranks.
  !! The local value comes from `hip_glsc3`, `cuda_glsc3` or
  !! `opencl_glsc3`. Unless `HAVE_DEVICE_MPI` is defined, the value is then
  !! summed in place across `neko_comm` with `mpi_allreduce` when more than
  !! one rank is present. Without any backend macro `neko_error` is called.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the local vector length).
  !! @return Reduced value of working precision.
  function device_glsc3(a_d, b_d, c_d, n) result(res)
    type(c_ptr) :: a_d, b_d, c_d
    integer :: n, ierr
    real(kind=rp) :: res
    ! Give the result a defined value on every path, as device_vlsc3 does.
    res = 0.0_rp
#if HAVE_HIP
    res = hip_glsc3(a_d, b_d, c_d, n)
#elif HAVE_CUDA
    res = cuda_glsc3(a_d, b_d, c_d, n)
#elif HAVE_OPENCL
    res = opencl_glsc3(a_d, b_d, c_d, n)
#else
    call neko_error('No device backend configured')
#endif

#ifndef HAVE_DEVICE_MPI
    ! Host-side reduction; skipped entirely on a single rank.
    if (pe_size .gt. 1) then
       call mpi_allreduce(mpi_in_place, res, 1, &
            mpi_real_precision, mpi_sum, neko_comm, ierr)
    end if
#endif
  end function device_glsc3


  !> Batched variant of the three-vector scalar reduction.
  !! The backend (`hip_glsc3_many`, `cuda_glsc3_many` or
  !! `opencl_glsc3_many`) fills the host array `h`. Unless
  !! `HAVE_DEVICE_MPI` is defined, the `j` entries of `h` are then summed
  !! in place across `neko_comm` when more than one rank is present.
  !! Without any backend macro `neko_error` is called.
  !! @param h Host array of length `j` receiving the results.
  !! @param w_d C pointer (by value) handed to the backend routine.
  !! @param v_d_d C pointer (by value) handed to the backend routine;
  !! presumably a device array of device pointers.
  !! @param mult_d C pointer (by value) handed to the backend routine.
  !! @param j Number of results; also the MPI reduction count.
  !! @param n Integer size argument (presumably the local vector length).
  subroutine device_glsc3_many(h, w_d, v_d_d, mult_d, j, n)
    type(c_ptr), value :: w_d, v_d_d, mult_d
    integer(c_int) :: j, n
    real(c_rp) :: h(j)
    integer :: ierr
#if HAVE_HIP
    call hip_glsc3_many(h, w_d, v_d_d, mult_d, j, n)
#elif HAVE_CUDA
    call cuda_glsc3_many(h, w_d, v_d_d, mult_d, j, n)
#elif HAVE_OPENCL
    call opencl_glsc3_many(h, w_d, v_d_d, mult_d, j, n)
#else
    call neko_error('No device backend configured')
#endif

#ifndef HAVE_DEVICE_MPI
    ! Host-side reduction of all j partial results in one call.
    if (pe_size .gt. 1) then
       call mpi_allreduce(mpi_in_place, h, j, &
            mpi_real_precision, mpi_sum, neko_comm, ierr)
    end if
#endif
  end subroutine device_glsc3_many


  !> Batched variant of `add2s2` (backend dispatch).
  !! Forwards the arguments unchanged to `hip_add2s2_many`,
  !! `cuda_add2s2_many` or `opencl_add2s2_many`; calls `neko_error` when
  !! no backend macro is set.
  !! @param y_d C pointer (by value) handed to the backend routine.
  !! @param x_d_d C pointer (by value) handed to the backend routine;
  !! presumably a device array of device pointers.
  !! @param a_d C pointer (by value) handed to the backend routine.
  !! @param j Integer count (presumably the number of vectors in the batch).
  !! @param n Integer size argument (presumably the vector length).
  subroutine device_add2s2_many(y_d, x_d_d, a_d, j, n)
    type(c_ptr), value :: y_d, x_d_d, a_d
    integer(c_int) :: j, n
#if HAVE_HIP
    call hip_add2s2_many(y_d, x_d_d, a_d, j, n)
#elif HAVE_CUDA
    call cuda_add2s2_many(y_d, x_d_d, a_d, j, n)
#elif HAVE_OPENCL
    call opencl_add2s2_many(y_d, x_d_d, a_d, j, n)
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_add2s2_many


  !> Two-vector scalar reduction, summed over all MPI ranks.
  !! The local value comes from `hip_glsc2`, `cuda_glsc2` or
  !! `opencl_glsc2`. Unless `HAVE_DEVICE_MPI` is defined, the value is then
  !! summed in place across `neko_comm` with `mpi_allreduce` when more than
  !! one rank is present. Without any backend macro `neko_error` is called.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the local vector length).
  !! @return Reduced value of working precision.
  function device_glsc2(a_d, b_d, n) result(res)
    type(c_ptr) :: a_d, b_d
    integer :: n, ierr
    real(kind=rp) :: res
    ! Give the result a defined value on every path, as device_vlsc3 does.
    res = 0.0_rp
#if HAVE_HIP
    res = hip_glsc2(a_d, b_d, n)
#elif HAVE_CUDA
    res = cuda_glsc2(a_d, b_d, n)
#elif HAVE_OPENCL
    res = opencl_glsc2(a_d, b_d, n)
#else
    call neko_error('No device backend configured')
#endif

#ifndef HAVE_DEVICE_MPI
    ! Host-side reduction; skipped entirely on a single rank.
    if (pe_size .gt. 1) then
       call mpi_allreduce(mpi_in_place, res, 1, &
            mpi_real_precision, mpi_sum, neko_comm, ierr)
    end if
#endif
  end function device_glsc2


  !> Sum reduction of one vector, summed over all MPI ranks.
  !! The local value comes from `hip_glsum`, `cuda_glsum` or
  !! `opencl_glsum`. Unless `HAVE_DEVICE_MPI` is defined, the value is then
  !! summed in place across `neko_comm` with `mpi_allreduce` when more than
  !! one rank is present. Without any backend macro `neko_error` is called.
  !! @param a_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the local vector length).
  !! @return Reduced value of working precision.
  function device_glsum(a_d, n) result(res)
    type(c_ptr) :: a_d
    integer :: n, ierr
    real(kind=rp) :: res
    ! Give the result a defined value on every path, as device_vlsc3 does.
    res = 0.0_rp
#if HAVE_HIP
    res = hip_glsum(a_d, n)
#elif HAVE_CUDA
    res = cuda_glsum(a_d, n)
#elif HAVE_OPENCL
    res = opencl_glsum(a_d, n)
#else
    call neko_error('No device backend configured')
#endif

#ifndef HAVE_DEVICE_MPI
    ! Host-side reduction; skipped entirely on a single rank.
    if (pe_size .gt. 1) then
       call mpi_allreduce(mpi_in_place, res, 1, &
            mpi_real_precision, mpi_sum, neko_comm, ierr)
    end if
#endif
  end function device_glsum


  !> Backend dispatch for the absolute-value kernel.
  !! Forwards the arguments unchanged to `hip_absval` or `cuda_absval`.
  !! There is no OpenCL implementation: an OpenCL build, like a build
  !! without any backend, ends in `neko_error`.
  !! @param a_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_absval(a_d, n)
    integer, intent(in) :: n
    type(c_ptr) :: a_d
    ! `#if` (not `#ifdef`) so that HAVE_HIP=0 selects the same branch here
    ! as in every other routine of this module.
#if HAVE_HIP
    call hip_absval(a_d, n)
#elif HAVE_CUDA
    call cuda_absval(a_d, n)
#elif HAVE_OPENCL
    call neko_error('OPENCL is not implemented for device_absval')
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_absval


  ! ========================================================================== !

  ! Device point-wise max


  !> Compute the point-wise maximum of two vectors.
  !! Only the CUDA backend implements this (`cuda_pwmax_vec2`); HIP and
  !! OpenCL builds, and builds without any backend, end in `neko_error`.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_pwmax_vec2(a_d, b_d, n)
    type(c_ptr) :: a_d, b_d
    integer :: n
#if HAVE_HIP
    call neko_error('No HIP backend for device_pwmax_vec2')
#elif HAVE_CUDA
    call cuda_pwmax_vec2(a_d, b_d, n)
#elif HAVE_OPENCL
    call neko_error('No OpenCL backend for device_pwmax_vec2')
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_pwmax_vec2


  !> Point-wise maximum, three-vector form.
  !! Only the CUDA backend implements this (`cuda_pwmax_vec3`); HIP and
  !! OpenCL builds, and builds without any backend, end in `neko_error`.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_pwmax_vec3(a_d, b_d, c_d, n)
    type(c_ptr) :: a_d, b_d, c_d
    integer :: n
#if HAVE_HIP
    call neko_error('No HIP backend for device_pwmax_vec3')
#elif HAVE_CUDA
    call cuda_pwmax_vec3(a_d, b_d, c_d, n)
#elif HAVE_OPENCL
    call neko_error('No OpenCL backend for device_pwmax_vec3')
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_pwmax_vec3


  !> Compute the point-wise maximum of a vector and a scalar.
  !! Only the CUDA backend implements this (`cuda_pwmax_sca2`); HIP and
  !! OpenCL builds, and builds without any backend, end in `neko_error`.
  !! @param a_d C pointer handed to the backend routine.
  !! @param c Scalar of working precision.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_pwmax_sca2(a_d, c, n)
    type(c_ptr) :: a_d
    real(kind=rp), intent(in) :: c
    integer :: n
#if HAVE_HIP
    call neko_error('No HIP backend for device_pwmax_sca2')
#elif HAVE_CUDA
    call cuda_pwmax_sca2(a_d, c, n)
#elif HAVE_OPENCL
    call neko_error('No OpenCL backend for device_pwmax_sca2')
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_pwmax_sca2


  !> Compute the point-wise maximum of a vector and a scalar, form with
  !! two vector arguments.
  !! Only the CUDA backend implements this (`cuda_pwmax_sca3`); HIP and
  !! OpenCL builds, and builds without any backend, end in `neko_error`.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c Scalar of working precision.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_pwmax_sca3(a_d, b_d, c, n)
    type(c_ptr) :: a_d, b_d
    real(kind=rp), intent(in) :: c
    integer :: n
#if HAVE_HIP
    call neko_error('No HIP backend for device_pwmax_sca3')
#elif HAVE_CUDA
    call cuda_pwmax_sca3(a_d, b_d, c, n)
#elif HAVE_OPENCL
    call neko_error('No OpenCL backend for device_pwmax_sca3')
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_pwmax_sca3


  ! ========================================================================== !

  ! Device point-wise min


  !> Point-wise minimum, two-vector form.
  !! Only the CUDA backend implements this (`cuda_pwmin_vec2`); HIP and
  !! OpenCL builds, and builds without any backend, end in `neko_error`.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_pwmin_vec2(a_d, b_d, n)
    type(c_ptr) :: a_d, b_d
    integer :: n
#if HAVE_HIP
    call neko_error('No HIP backend for device_pwmin_vec2')
#elif HAVE_CUDA
    call cuda_pwmin_vec2(a_d, b_d, n)
#elif HAVE_OPENCL
    call neko_error('No OpenCL backend for device_pwmin_vec2')
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_pwmin_vec2


  !> Point-wise minimum, three-vector form.
  !! Only the CUDA backend implements this (`cuda_pwmin_vec3`); HIP and
  !! OpenCL builds, and builds without any backend, end in `neko_error`.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c_d C pointer handed to the backend routine.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_pwmin_vec3(a_d, b_d, c_d, n)
    type(c_ptr) :: a_d, b_d, c_d
    integer :: n
#if HAVE_HIP
    call neko_error('No HIP backend for device_pwmin_vec3')
#elif HAVE_CUDA
    call cuda_pwmin_vec3(a_d, b_d, c_d, n)
#elif HAVE_OPENCL
    call neko_error('No OpenCL backend for device_pwmin_vec3')
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_pwmin_vec3


  !> Point-wise minimum of a vector and a scalar.
  !! Only the CUDA backend implements this (`cuda_pwmin_sca2`); HIP and
  !! OpenCL builds, and builds without any backend, end in `neko_error`.
  !! @param a_d C pointer handed to the backend routine.
  !! @param c Scalar of working precision.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_pwmin_sca2(a_d, c, n)
    type(c_ptr) :: a_d
    real(kind=rp), intent(in) :: c
    integer :: n
#if HAVE_HIP
    call neko_error('No HIP backend for device_pwmin_sca2')
#elif HAVE_CUDA
    call cuda_pwmin_sca2(a_d, c, n)
#elif HAVE_OPENCL
    call neko_error('No OpenCL backend for device_pwmin_sca2')
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_pwmin_sca2


  !> Point-wise minimum of a vector and a scalar, form with two vector
  !! arguments.
  !! Only the CUDA backend implements this (`cuda_pwmin_sca3`); HIP and
  !! OpenCL builds, and builds without any backend, end in `neko_error`.
  !! @param a_d C pointer handed to the backend routine.
  !! @param b_d C pointer handed to the backend routine.
  !! @param c Scalar of working precision.
  !! @param n Integer size argument (presumably the number of entries).
  subroutine device_pwmin_sca3(a_d, b_d, c, n)
    type(c_ptr) :: a_d, b_d
    real(kind=rp), intent(in) :: c
    integer :: n
#if HAVE_HIP
    call neko_error('No HIP backend for device_pwmin_sca3')
#elif HAVE_CUDA
    call cuda_pwmin_sca3(a_d, b_d, c, n)
#elif HAVE_OPENCL
    call neko_error('No OpenCL backend for device_pwmin_sca3')
#else
    call neko_error('No device backend configured')
#endif
  end subroutine device_pwmin_sca3


end module device_math

cuda_math::cuda_absval
Definition cuda_math.f90:319

cuda_math::cuda_add2
Definition cuda_math.f90:131

cuda_math::cuda_add2s1
Definition cuda_math.f90:147

cuda_math::cuda_add2s2_many
Definition cuda_math.f90:278

cuda_math::cuda_add2s2
Definition cuda_math.f90:156

cuda_math::cuda_add3
Definition cuda_math.f90:232

cuda_math::cuda_add3s2
Definition cuda_math.f90:174

cuda_math::cuda_add4
Definition cuda_math.f90:139

cuda_math::cuda_addcol3
Definition cuda_math.f90:239

cuda_math::cuda_addcol4
Definition cuda_math.f90:246

cuda_math::cuda_addsqr2s2
Definition cuda_math.f90:165

cuda_math::cuda_cadd2
Definition cuda_math.f90:105

cuda_math::cuda_cadd
Definition cuda_math.f90:96

cuda_math::cuda_cfill_mask
Definition cuda_math.f90:67

cuda_math::cuda_cfill
Definition cuda_math.f90:115

cuda_math::cuda_cmult2
Definition cuda_math.f90:87

cuda_math::cuda_cmult
Definition cuda_math.f90:78

cuda_math::cuda_col2
Definition cuda_math.f90:197

cuda_math::cuda_col3
Definition cuda_math.f90:204

cuda_math::cuda_copy
Definition cuda_math.f90:39

cuda_math::cuda_glsc2
Definition cuda_math.f90:303

cuda_math::cuda_glsc3_many
Definition cuda_math.f90:294

cuda_math::cuda_glsc3
Definition cuda_math.f90:286

cuda_math::cuda_glsum
Definition cuda_math.f90:311

cuda_math::cuda_invcol1
Definition cuda_math.f90:183

cuda_math::cuda_invcol2
Definition cuda_math.f90:190

cuda_math::cuda_masked_atomic_reduction
Definition cuda_math.f90:60

cuda_math::cuda_masked_copy
Definition cuda_math.f90:46

cuda_math::cuda_masked_red_copy
Definition cuda_math.f90:53

cuda_math::cuda_pwmax_sca2
Definition cuda_math.f90:346

cuda_math::cuda_pwmax_sca3
Definition cuda_math.f90:355

cuda_math::cuda_pwmax_vec2
Definition cuda_math.f90:332

cuda_math::cuda_pwmax_vec3
Definition cuda_math.f90:339

cuda_math::cuda_pwmin_sca2
Definition cuda_math.f90:378

cuda_math::cuda_pwmin_sca3
Definition cuda_math.f90:387

cuda_math::cuda_pwmin_vec2
Definition cuda_math.f90:364

cuda_math::cuda_pwmin_vec3
Definition cuda_math.f90:371

cuda_math::cuda_rzero
Definition cuda_math.f90:124

cuda_math::cuda_sub2
Definition cuda_math.f90:218

cuda_math::cuda_sub3
Definition cuda_math.f90:225

cuda_math::cuda_subcol3
Definition cuda_math.f90:211

cuda_math::cuda_vcross
Definition cuda_math.f90:260

cuda_math::cuda_vdot3
Definition cuda_math.f90:253

cuda_math::cuda_vlsc3
Definition cuda_math.f90:270

device_math::device_pwmax
Definition device_math.F90:50

device_math::device_pwmin
Definition device_math.F90:55

hip_math::hip_absval
Definition hip_math.f90:343

hip_math::hip_add2
Definition hip_math.f90:131

hip_math::hip_add2s1
Definition hip_math.f90:149

hip_math::hip_add2s2_many
Definition hip_math.f90:169

hip_math::hip_add2s2
Definition hip_math.f90:159

hip_math::hip_add3
Definition hip_math.f90:254

hip_math::hip_add3s2
Definition hip_math.f90:188

hip_math::hip_add4
Definition hip_math.f90:140

hip_math::hip_addcol3
Definition hip_math.f90:262

hip_math::hip_addcol4
Definition hip_math.f90:270

hip_math::hip_addsqr2s2
Definition hip_math.f90:178

hip_math::hip_cadd2
Definition hip_math.f90:105

hip_math::hip_cadd
Definition hip_math.f90:96

hip_math::hip_cfill_mask
Definition hip_math.f90:67

hip_math::hip_cfill
Definition hip_math.f90:115

hip_math::hip_cmult2
Definition hip_math.f90:87

hip_math::hip_cmult
Definition hip_math.f90:78

hip_math::hip_col2
Definition hip_math.f90:214

hip_math::hip_col3
Definition hip_math.f90:222

hip_math::hip_copy
Definition hip_math.f90:39

hip_math::hip_glsc2
Definition hip_math.f90:325

hip_math::hip_glsc3_many
Definition hip_math.f90:315

hip_math::hip_glsc3
Definition hip_math.f90:306

hip_math::hip_glsum
Definition hip_math.f90:334

hip_math::hip_invcol1
Definition hip_math.f90:198

hip_math::hip_invcol2
Definition hip_math.f90:206

hip_math::hip_masked_atomic_reduction
Definition hip_math.f90:60

hip_math::hip_masked_copy
Definition hip_math.f90:46

hip_math::hip_masked_red_copy
Definition hip_math.f90:53

hip_math::hip_rzero
Definition hip_math.f90:124

hip_math::hip_sub2
Definition hip_math.f90:238

hip_math::hip_sub3
Definition hip_math.f90:246

hip_math::hip_subcol3
Definition hip_math.f90:230

hip_math::hip_vcross
Definition hip_math.f90:286

hip_math::hip_vdot3
Definition hip_math.f90:278

hip_math::hip_vlsc3
Definition hip_math.f90:297

opencl_math::opencl_add2
Definition opencl_math.f90:124

opencl_math::opencl_add2s1
Definition opencl_math.f90:140

opencl_math::opencl_add2s2_many
Definition opencl_math.f90:160

opencl_math::opencl_add2s2
Definition opencl_math.f90:150

opencl_math::opencl_add3
Definition opencl_math.f90:245

opencl_math::opencl_add3s2
Definition opencl_math.f90:179

opencl_math::opencl_add4
Definition opencl_math.f90:132

opencl_math::opencl_addcol3
Definition opencl_math.f90:253

opencl_math::opencl_addcol4
Definition opencl_math.f90:261

opencl_math::opencl_addsqr2s2
Definition opencl_math.f90:169

opencl_math::opencl_cadd2
Definition opencl_math.f90:91

opencl_math::opencl_cadd
Definition opencl_math.f90:82

opencl_math::opencl_cfill_mask
Definition opencl_math.f90:53

opencl_math::opencl_cfill
Definition opencl_math.f90:101

opencl_math::opencl_cmult2
Definition opencl_math.f90:73

opencl_math::opencl_cmult
Definition opencl_math.f90:64

opencl_math::opencl_col2
Definition opencl_math.f90:205

opencl_math::opencl_col3
Definition opencl_math.f90:213

opencl_math::opencl_copy
Definition opencl_math.f90:39

opencl_math::opencl_glsc2
Definition opencl_math.f90:296

opencl_math::opencl_glsc3_many
Definition opencl_math.f90:286

opencl_math::opencl_glsc3
Definition opencl_math.f90:277

opencl_math::opencl_glsum
Definition opencl_math.f90:305

opencl_math::opencl_invcol1
Definition opencl_math.f90:189

opencl_math::opencl_invcol2
Definition opencl_math.f90:197

opencl_math::opencl_masked_copy
Definition opencl_math.f90:46

opencl_math::opencl_rzero
Definition opencl_math.f90:110

opencl_math::opencl_sub2
Definition opencl_math.f90:229

opencl_math::opencl_sub3
Definition opencl_math.f90:237

opencl_math::opencl_subcol3
Definition opencl_math.f90:221

opencl_math::opencl_vdot3
Definition opencl_math.f90:269

utils::neko_error
Definition utils.f90:42

comm
Definition comm.F90:1

comm::neko_comm
type(mpi_comm) neko_comm
MPI communicator.
Definition comm.F90:38

comm::mpi_real_precision
type(mpi_datatype) mpi_real_precision
MPI type for working precision of REAL types.
Definition comm.F90:46

comm::pe_size
integer pe_size
MPI size of communicator.
Definition comm.F90:54

cuda_math
Definition cuda_math.f90:33

device_math
Definition device_math.F90:33

device_math::device_add2
subroutine, public device_add2(a_d, b_d, n)
Vector addition \(a = a + b\).
Definition device_math.F90:261

device_math::device_pwmax_sca2
subroutine device_pwmax_sca2(a_d, c, n)
Compute the point-wise maximum of a vector and a scalar .
Definition device_math.F90:720

device_math::device_addcol3
subroutine, public device_addcol3(a_d, b_d, c_d, n)
Returns \(a = a + b \cdot c\).
Definition device_math.F90:474

device_math::device_col2
subroutine, public device_col2(a_d, b_d, n)
Vector multiplication \(a = a \cdot b\).
Definition device_math.F90:399

device_math::device_add2s1
subroutine, public device_add2s1(a_d, b_d, c1, n)
Definition device_math.F90:289

device_math::device_rzero
subroutine, public device_rzero(a_d, n)
Zero a real vector.
Definition device_math.F90:153

device_math::device_vlsc3
real(kind=rp) function, public device_vlsc3(u_d, v_d, w_d, n)
Compute multiplication sum .
Definition device_math.F90:542

device_math::device_rone
subroutine, public device_rone(a_d, n)
Set all elements to one.
Definition device_math.F90:168

device_math::device_add2s2
subroutine, public device_add2s2(a_d, b_d, c1, n)
Vector addition with scalar multiplication $a = a + c_1 b$ (multiplication on second argument)
Definition device_math.F90:306

device_math::device_pwmax_vec2
subroutine device_pwmax_vec2(a_d, b_d, n)
Compute the point-wise maximum of two vectors, $a = \max(a, b)$.
Definition device_math.F90:685

device_math::device_invcol1
subroutine, public device_invcol1(a_d, n)
Invert a vector $a = 1 / a$.
Definition device_math.F90:369

device_math::device_col3
subroutine, public device_col3(a_d, b_d, c_d, n)
Vector multiplication with 3 vectors $a = b \cdot c$.
Definition device_math.F90:414

device_math::device_add4
subroutine, public device_add4(a_d, b_d, c_d, d_d, n)
Definition device_math.F90:275

device_math::device_cadd
subroutine, public device_cadd(a_d, c, n)
Add a scalar to vector $a = a + c$.
Definition device_math.F90:212

device_math::device_masked_red_copy
subroutine, public device_masked_red_copy(a_d, b_d, mask_d, n, m)
Definition device_math.F90:105

device_math::device_vdot3
subroutine, public device_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n)
Compute a dot product $u \cdot v$ (3-d version) assuming vector components $u = (u_1, u_2, u_3)$ etc.
Definition device_math.F90:505

device_math::device_cmult2
subroutine, public device_cmult2(a_d, b_d, c, n)
Multiplication by constant c, $a = c \cdot b$.
Definition device_math.F90:196

device_math::device_vcross
subroutine, public device_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, w1_d, w2_d, w3_d, n)
Compute a cross product $w = u \times v$ (3-d version) assuming vector components $u = (u_1, u_2, u_3)$ etc.
Definition device_math.F90:522

device_math::device_cmult
subroutine, public device_cmult(a_d, c, n)
Multiplication by constant c, $a = c \cdot a$.
Definition device_math.F90:180

device_math::device_pwmax_sca3
subroutine device_pwmax_sca3(a_d, b_d, c, n)
Compute the point-wise maximum of a vector and a scalar, $a = \max(b, c)$.
Definition device_math.F90:739

device_math::device_absval
subroutine, public device_absval(a_d, n)
Definition device_math.F90:665

device_math::device_masked_copy
subroutine, public device_masked_copy(a_d, b_d, mask_d, n, m)
Copy a masked vector $a(mask) = b(mask)$.
Definition device_math.F90:91

device_math::device_pwmax_vec3
subroutine device_pwmax_vec3(a_d, b_d, c_d, n)
Compute the point-wise maximum of two vectors, $a = \max(b, c)$.
Definition device_math.F90:702

device_math::device_add2s2_many
subroutine, public device_add2s2_many(y_d, x_d_d, a_d, j, n)
Definition device_math.F90:605

device_math::device_pwmin_sca3
subroutine device_pwmin_sca3(a_d, b_d, c, n)
Compute the point-wise minimum of a vector and a scalar, $a = \min(b, c)$.
Definition device_math.F90:815

device_math::device_masked_atomic_reduction
subroutine, public device_masked_atomic_reduction(a_d, b_d, mask_d, n, m)
Definition device_math.F90:119

device_math::device_cfill_mask
subroutine, public device_cfill_mask(a_d, c, size, mask_d, mask_size)
Fill a constant to a masked vector: $a_i = c$ for $i$ in the mask.
Definition device_math.F90:135

device_math::device_glsc2
real(kind=rp) function, public device_glsc2(a_d, b_d, n)
Weighted inner product $a^T b$.
Definition device_math.F90:620

device_math::device_sub3
subroutine, public device_sub3(a_d, b_d, c_d, n)
Vector subtraction $a = b - c$.
Definition device_math.F90:459

device_math::device_glsc3
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n)
Weighted inner product $a^T b c$.
Definition device_math.F90:560

device_math::device_add3
subroutine, public device_add3(a_d, b_d, c_d, n)
Vector addition $a = b + c$.
Definition device_math.F90:338

device_math::device_pwmin_vec3
subroutine device_pwmin_vec3(a_d, b_d, c_d, n)
Compute the point-wise minimum of two vectors, $a = \min(b, c)$.
Definition device_math.F90:778

device_math::device_glsum
real(kind=rp) function, public device_glsum(a_d, n)
Sum a vector of length n.
Definition device_math.F90:643

device_math::device_cadd2
subroutine, public device_cadd2(a_d, b_d, c, n)
Add a scalar to vector $a = b + c$.
Definition device_math.F90:228

device_math::device_copy
subroutine, public device_copy(a_d, b_d, n)
Copy a vector $a = b$.
Definition device_math.F90:76

device_math::device_add3s2
subroutine, public device_add3s2(a_d, b_d, c_d, c1, c2, n)
Returns $a = c_1 b + c_2 c$.
Definition device_math.F90:353

device_math::device_subcol3
subroutine, public device_subcol3(a_d, b_d, c_d, n)
Returns $a = a - b \cdot c$.
Definition device_math.F90:429

device_math::device_glsc3_many
subroutine, public device_glsc3_many(h, w_d, v_d_d, mult_d, j, n)
Definition device_math.F90:582

device_math::device_sub2
subroutine, public device_sub2(a_d, b_d, n)
Vector subtraction $a = a - b$.
Definition device_math.F90:444

device_math::device_cfill
subroutine, public device_cfill(a_d, c, n)
Set all elements to a constant c, $a = c$.
Definition device_math.F90:245

device_math::device_addcol4
subroutine, public device_addcol4(a_d, b_d, c_d, d_d, n)
Returns $a = a + b \cdot c \cdot d$.
Definition device_math.F90:489

device_math::device_pwmin_sca2
subroutine device_pwmin_sca2(a_d, c, n)
Compute the point-wise minimum of a vector and a scalar, $a = \min(a, c)$.
Definition device_math.F90:796

device_math::device_pwmin_vec2
subroutine device_pwmin_vec2(a_d, b_d, n)
Compute the point-wise minimum of two vectors, $a = \min(a, b)$.
Definition device_math.F90:761

device_math::device_invcol2
subroutine, public device_invcol2(a_d, b_d, n)
Vector division $a = a / b$.
Definition device_math.F90:384

device_math::device_addsqr2s2
subroutine, public device_addsqr2s2(a_d, b_d, c1, n)
Returns $a = a + c_1 (b \cdot b)$.
Definition device_math.F90:322

hip_math
Definition hip_math.f90:33

num_types
Definition num_types.f90:1

num_types::c_rp
integer, parameter, public c_rp
Definition num_types.f90:13

num_types::rp
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12

opencl_math
Definition opencl_math.f90:33

utils
Utilities.
Definition utils.f90:35