! Source listing: gs_device.F90 (Doxygen page d2/d11/gs__device_8F90_source.html)

! Copyright (c) 2021-2022, The Neko Authors

! All rights reserved.

!

! Redistribution and use in source and binary forms, with or without

! modification, are permitted provided that the following conditions

! are met:

!

!   * Redistributions of source code must retain the above copyright

!     notice, this list of conditions and the following disclaimer.

!

!   * Redistributions in binary form must reproduce the above

!     copyright notice, this list of conditions and the following

!     disclaimer in the documentation and/or other materials provided

!     with the distribution.

!

!   * Neither the name of the authors nor the names of its

!     contributors may be used to endorse or promote products derived

!     from this software without specific prior written permission.

!

! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

! POSSIBILITY OF SUCH DAMAGE.

!

module gs_device

  use neko_config

  use num_types

  use gs_bcknd

  use device

  use gs_ops

  use utils

  use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_null_ptr, &

                                          c_associated

  implicit none

  private


  !> Gather-scatter backend for offloading devices.
  !!
  !! Holds the device handles used by the gather and scatter procedures,
  !! kept separately for the local and the shared part. All handles start
  !! out as `c_null_ptr` and are mapped on first use in `gs_gather_device`.
  !! The events and the stream used by the bound procedures
  !! (`gather_event`, `scatter_event`, `gs_stream`) are not declared here;
  !! they are inherited from the parent type `gs_bcknd_t`.
  type, public, extends(gs_bcknd_t) :: gs_device_t
     !> Host copy of the local block offsets (running sum of block lengths)
     integer, allocatable :: local_blk_off(:)
     !> Host copy of the shared block offsets (running sum of block lengths)
     integer, allocatable :: shared_blk_off(:)
     !> Device handle mapped to the local gather buffer `v`
     type(c_ptr) :: local_gs_d = c_null_ptr
     !> Device handle mapped to the local `dg` index array
     type(c_ptr) :: local_dof_gs_d = c_null_ptr
     !> Device handle mapped to the local `gd` index array
     type(c_ptr) :: local_gs_dof_d = c_null_ptr
     !> Device handle mapped to the shared gather buffer `v`
     type(c_ptr) :: shared_gs_d = c_null_ptr
     !> Device handle mapped to the shared `dg` index array
     type(c_ptr) :: shared_dof_gs_d = c_null_ptr
     !> Device handle mapped to the shared `gd` index array
     type(c_ptr) :: shared_gs_dof_d = c_null_ptr
     !> Device handle mapped to the local block lengths `b`
     type(c_ptr) :: local_blk_len_d = c_null_ptr
     !> Device handle mapped to the shared block lengths `b`
     type(c_ptr) :: shared_blk_len_d = c_null_ptr
     !> Device handle mapped to `local_blk_off`
     type(c_ptr) :: local_blk_off_d = c_null_ptr
     !> Device handle mapped to `shared_blk_off`
     type(c_ptr) :: shared_blk_off_d = c_null_ptr
     !> Size of the local part, as given to `init` (0 when uninitialised)
     integer :: nlocal = 0
     !> Size of the shared part, as given to `init` (0 when uninitialised)
     integer :: nshared = 0
     !> If true, the shared gather buffer is copied to the host after a
     !! shared gather and copied back to the device before a shared scatter
     logical :: shared_on_host = .true.
   contains
     procedure, pass(this) :: init => gs_device_init
     procedure, pass(this) :: free => gs_device_free
     procedure, pass(this) :: gather => gs_gather_device
     procedure, pass(this) :: scatter => gs_scatter_device
  end type gs_device_t


#ifdef HAVE_HIP

  ! C binding for the HIP gather launcher. Pointer arguments (device
  ! handles and the stream) are passed by value, integer arguments by
  ! reference.
  interface
     subroutine hip_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
          bind(c, name='hip_gather_kernel')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int) :: m, n, nb, o, op
       type(c_ptr), value :: v, u, dg, gd, b, bo, strm
     end subroutine hip_gather_kernel
  end interface


  ! C binding for the HIP scatter launcher. Pointer arguments (device
  ! handles and the stream) are passed by value, integer arguments by
  ! reference.
  interface
     subroutine hip_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
          bind(c, name='hip_scatter_kernel')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int) :: m, n, nb
       type(c_ptr), value :: v, u, dg, gd, b, bo, strm
     end subroutine hip_scatter_kernel
  end interface


#elif HAVE_CUDA

  ! C binding for the CUDA gather launcher (see the C prototype
  ! `cuda_gather_kernel`): pointers and the stream go by value,
  ! integers by reference.
  interface
     subroutine cuda_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
          bind(c, name='cuda_gather_kernel')
       use, intrinsic :: iso_c_binding, only : c_int, c_ptr
       implicit none
       type(c_ptr), value :: v, dg, u, gd, b, bo, strm
       integer(c_int) :: m, o, n, nb, op
     end subroutine cuda_gather_kernel
  end interface


  ! C binding for the CUDA scatter launcher (see the C prototype
  ! `cuda_scatter_kernel`): pointers and the stream go by value,
  ! integers by reference.
  interface
     subroutine cuda_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
          bind(c, name='cuda_scatter_kernel')
       use, intrinsic :: iso_c_binding, only : c_int, c_ptr
       implicit none
       type(c_ptr), value :: v, dg, u, gd, b, bo, strm
       integer(c_int) :: m, n, nb
     end subroutine cuda_scatter_kernel
  end interface

#elif HAVE_OPENCL

  ! C binding for the OpenCL gather launcher (see the C prototype
  ! `opencl_gather_kernel`): pointers go by value, integers by
  ! reference. Unlike the HIP/CUDA variants it takes no stream argument.
  interface
     subroutine opencl_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op) &
          bind(c, name='opencl_gather_kernel')
       use, intrinsic :: iso_c_binding, only : c_int, c_ptr
       implicit none
       type(c_ptr), value :: v, dg, u, gd, b, bo
       integer(c_int) :: m, o, n, nb, op
     end subroutine opencl_gather_kernel
  end interface


  ! C binding for the OpenCL scatter launcher (see the C prototype
  ! `opencl_scatter_kernel`): pointers go by value, integers by
  ! reference. Unlike the HIP/CUDA variants it takes no stream argument.
  interface
     subroutine opencl_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo) &
          bind(c, name='opencl_scatter_kernel')
       use, intrinsic :: iso_c_binding, only : c_int, c_ptr
       implicit none
       type(c_ptr), value :: v, dg, u, gd, b, bo
       integer(c_int) :: m, n, nb
     end subroutine opencl_scatter_kernel
  end interface

#endif


contains


  !> Accelerator backend initialisation.
  !!
  !! Releases any previous state, records the sizes, allocates the host
  !! block-offset arrays and resets all device handles. The device
  !! buffers themselves are not created here; they are mapped on the
  !! first call to `gs_gather_device`.
  !! @param nlocal Size of the local part, stored in `this%nlocal`
  !! @param nshared Size of the shared part, stored in `this%nshared`
  !! @param nlcl_blks Number of local blocks (size of `local_blk_off`)
  !! @param nshrd_blks Number of shared blocks (size of `shared_blk_off`)
  subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
    class(gs_device_t), intent(inout) :: this
    integer, intent(in) :: nlocal
    integer, intent(in) :: nshared
    integer, intent(in) :: nlcl_blks
    integer, intent(in) :: nshrd_blks

    ! Start from a clean state so that re-initialisation is possible
    call this%free()

    this%nlocal = nlocal
    this%nshared = nshared

    allocate(this%local_blk_off(nlcl_blks))
    allocate(this%shared_blk_off(nshrd_blks))

    ! A null handle marks a buffer as "not yet mapped" for the gather
    this%local_gs_d = c_null_ptr
    this%local_dof_gs_d = c_null_ptr
    this%local_gs_dof_d = c_null_ptr
    this%local_blk_len_d = c_null_ptr
    this%local_blk_off_d = c_null_ptr
    this%shared_gs_d = c_null_ptr
    this%shared_dof_gs_d = c_null_ptr
    this%shared_gs_dof_d = c_null_ptr
    this%shared_blk_len_d = c_null_ptr
    this%shared_blk_off_d = c_null_ptr

    this%shared_on_host = .true.

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
    ! NOTE(review): the flag value 2 is forwarded unchanged to
    ! device_event_create; presumably the "disable timing" event
    ! flag of the HIP/CUDA runtime -- confirm against device.F90.
    call device_event_create(this%gather_event, 2)
    call device_event_create(this%scatter_event, 2)
#endif

    ! All gather-scatter work is queued on the global command queue
    this%gs_stream = glb_cmd_queue

  end subroutine gs_device_init


  !> Accelerator backend deallocation.
  !!
  !! Deallocates the host block-offset arrays, frees every device buffer
  !! that has been mapped (local and shared), resets the sizes, destroys
  !! the gather/scatter events (HIP/CUDA builds) and drops the stream
  !! handle. Unmapped (null) handles and unallocated arrays are skipped.
  subroutine gs_device_free(this)
    class(gs_device_t), intent(inout) :: this

    if (allocated(this%local_blk_off)) then
       deallocate(this%local_blk_off)
    end if

    if (allocated(this%shared_blk_off)) then
       deallocate(this%shared_blk_off)
    end if

    ! Device buffers of the local part
    if (c_associated(this%local_gs_d)) then
       call device_free(this%local_gs_d)
    end if

    if (c_associated(this%local_dof_gs_d)) then
       call device_free(this%local_dof_gs_d)
    end if

    if (c_associated(this%local_gs_dof_d)) then
       call device_free(this%local_gs_dof_d)
    end if

    if (c_associated(this%local_blk_len_d)) then
       call device_free(this%local_blk_len_d)
    end if

    if (c_associated(this%local_blk_off_d)) then
       call device_free(this%local_blk_off_d)
    end if

    ! Device buffers of the shared part. The three gather/index buffers
    ! are mapped by the shared branch of gs_gather_device and must be
    ! released here as well, otherwise they leak on free/re-init.
    if (c_associated(this%shared_gs_d)) then
       call device_free(this%shared_gs_d)
    end if

    if (c_associated(this%shared_dof_gs_d)) then
       call device_free(this%shared_dof_gs_d)
    end if

    if (c_associated(this%shared_gs_dof_d)) then
       call device_free(this%shared_gs_dof_d)
    end if

    if (c_associated(this%shared_blk_len_d)) then
       call device_free(this%shared_blk_len_d)
    end if

    if (c_associated(this%shared_blk_off_d)) then
       call device_free(this%shared_blk_off_d)
    end if

    this%nlocal = 0
    this%nshared = 0

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
    if (c_associated(this%gather_event)) then
       call device_event_destroy(this%gather_event)
    end if

    if (c_associated(this%scatter_event)) then
       call device_event_destroy(this%scatter_event)
    end if
#endif

    ! The stream is the global command queue (set in init); it is not
    ! owned by this object, so only the handle is reset.
    if (c_associated(this%gs_stream)) then
       this%gs_stream = c_null_ptr
    end if

  end subroutine gs_device_free


  !> Gather kernel.
  !!
  !! Launches the backend gather kernel for either the local or the
  !! shared part. On the first call for a part, the host arrays are
  !! mapped to the device: `v` is only mapped, `dg`, `gd` and `b` are
  !! mapped and copied, and the block offsets are built on the host as a
  !! running sum of `b` and then copied. Later calls reuse the handles.
  !! For the shared part an event is recorded after the kernel
  !! (HIP/CUDA builds) and, if `shared_on_host` is set and `m` equals
  !! `this%nshared`, `v` is copied back to the host synchronously.
  !! @param v Gather buffer of length `m`
  !! @param m Length of `v`, `dg` and `gd`
  !! @param o Integer forwarded to the kernel
  !!          (presumably an offset -- confirm against the kernel)
  !! @param dg Index array of length `m`, copied to the device once
  !! @param u Field of length `n`; must already have a device pointer
  !! @param n Length of `u`
  !! @param gd Index array of length `m`, copied to the device once
  !! @param nb Number of blocks
  !! @param b Block lengths, length `nb`, copied to the device once
  !! @param op Operation code forwarded to the kernel
  !! @param shrd Select the shared (true) or the local (false) part
  subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
    integer, intent(in) :: m
    integer, intent(in) :: n
    integer, intent(in) :: nb
    class(gs_device_t), intent(inout) :: this
    real(kind=rp), dimension(m), intent(inout) :: v
    integer, dimension(m), intent(inout) :: dg
    real(kind=rp), dimension(n), intent(inout) :: u
    integer, dimension(m), intent(inout) :: gd
    integer, dimension(nb), intent(inout) :: b
    integer, intent(in) :: o
    integer, intent(in) :: op
    logical, intent(in) :: shrd
    integer :: i
    type(c_ptr) :: u_d

    u_d = device_get_ptr(u)

    if (.not. shrd) then
       associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
            gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
            bo=>this%local_blk_off, bo_d=>this%local_blk_off_d, &
            strm=>this%gs_stream)

         ! Lazy one-time mapping: a null handle means "not mapped yet"
         if (.not. c_associated(v_d)) then
            call device_map(v, v_d, m)
         end if

         if (.not. c_associated(dg_d)) then
            call device_map(dg, dg_d, m)
            call device_memcpy(dg, dg_d, m, host_to_device, &
                               sync=.false., strm=strm)
         end if

         if (.not. c_associated(gd_d)) then
            call device_map(gd, gd_d, m)
            call device_memcpy(gd, gd_d, m, host_to_device, &
                               sync=.false., strm=strm)
         end if

         if (.not. c_associated(b_d)) then
            call device_map(b, b_d, nb)
            call device_memcpy(b, b_d, nb, host_to_device, &
                               sync=.false., strm=strm)
         end if

         if (.not. c_associated(bo_d)) then
            call device_map(bo, bo_d, nb)
            ! Offsets are the running sum of the block lengths;
            ! with no blocks there is no first element to set
            if (nb .gt. 0) bo(1) = 0
            do i = 2, nb
               bo(i) = bo(i - 1) + b(i - 1)
            end do
            call device_memcpy(bo, bo_d, nb, host_to_device, &
                               sync=.false., strm=strm)
         end if

#ifdef HAVE_HIP
         call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
                                nb, b_d, bo_d, op, strm)
#elif HAVE_CUDA
         call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
              nb, b_d, bo_d, op, strm)
#elif HAVE_OPENCL
         call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
                                   nb, b_d, bo_d, op)
#else
         call neko_error('No device backend configured')
#endif

       end associate
    else
       associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
            gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
            bo=>this%shared_blk_off, bo_d=>this%shared_blk_off_d, &
            strm=>this%gs_stream)

         ! Lazy one-time mapping: a null handle means "not mapped yet"
         if (.not. c_associated(v_d)) then
            call device_map(v, v_d, m)
         end if

         if (.not. c_associated(dg_d)) then
            call device_map(dg, dg_d, m)
            call device_memcpy(dg, dg_d, m, host_to_device, &
                               sync=.false., strm=strm)
         end if

         if (.not. c_associated(gd_d)) then
            call device_map(gd, gd_d, m)
            call device_memcpy(gd, gd_d, m, host_to_device, &
                               sync=.false., strm=strm)
         end if

         if (.not. c_associated(b_d)) then
            call device_map(b, b_d, nb)
            call device_memcpy(b, b_d, nb, host_to_device, &
                               sync=.false., strm=strm)
         end if

         if (.not. c_associated(bo_d)) then
            call device_map(bo, bo_d, nb)
            ! Offsets are the running sum of the block lengths;
            ! with no blocks there is no first element to set
            if (nb .gt. 0) bo(1) = 0
            do i = 2, nb
               bo(i) = bo(i - 1) + b(i - 1)
            end do
            call device_memcpy(bo, bo_d, nb, host_to_device, &
                               sync=.false., strm=strm)
         end if

#ifdef HAVE_HIP
         call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
                                nb, b_d, bo_d, op, strm)
#elif HAVE_CUDA
         call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
              nb, b_d, bo_d, op, strm)
#elif HAVE_OPENCL
         call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
                                   nb, b_d, bo_d, op)
#else
         call neko_error('No device backend configured')
#endif

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
         ! Mark the completion point of the shared gather on the stream
         call device_event_record(this%gather_event, strm)
#endif

         ! Bring the gathered shared values back to the host; the copy
         ! is skipped when the buffer length differs from nshared
         if (this%shared_on_host) then
            if (this%nshared .eq. m) then
               call device_memcpy(v, v_d, m, device_to_host, &
                                  sync=.true., strm=strm)
            end if
         end if

       end associate
    end if

  end subroutine gs_gather_device


  !> Scatter kernel.
  !!
  !! Launches the backend scatter kernel for either the local or the
  !! shared part, using the device handles stored in `this`. No mapping
  !! is done here: the handles are only created in `gs_gather_device`,
  !! so a gather for the same part must have run before.
  !! For the shared part, `v` is first copied to the device when
  !! `shared_on_host` is set; after the kernel (HIP/CUDA builds) either
  !! `event` is recorded on the stream or, if `event` is null, the
  !! stream is synchronised.
  !! @param v Gather buffer of length `m`
  !! @param m Length of `v`, `dg` and `gd`
  !! @param dg Index array of length `m` (device copy is used)
  !! @param u Field of length `n`; must already have a device pointer
  !! @param n Length of `u`
  !! @param gd Index array of length `m` (device copy is used)
  !! @param nb Number of blocks
  !! @param b Block lengths, length `nb` (device copy is used)
  !! @param shrd Select the shared (true) or the local (false) part
  !! @param event Event to record after a shared scatter, or null
  subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
    integer, intent(in) :: m
    integer, intent(in) :: n
    integer, intent(in) :: nb
    class(gs_device_t), intent(inout) :: this
    real(kind=rp), dimension(m), intent(inout) :: v
    integer, dimension(m), intent(inout) :: dg
    real(kind=rp), dimension(n), intent(inout) :: u
    integer, dimension(m), intent(inout) :: gd
    integer, dimension(nb), intent(inout) :: b
    logical, intent(in) :: shrd
    ! No intent is given on purpose: the declaration has to match the
    ! deferred binding of the parent type, which is not visible here.
    type(c_ptr) :: event
    type(c_ptr) :: u_d

    u_d = device_get_ptr(u)

    if (.not. shrd) then
       associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
            gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
            bo_d=>this%local_blk_off_d, strm=>this%gs_stream)
#ifdef HAVE_HIP
         call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
#elif HAVE_CUDA
         call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
#elif HAVE_OPENCL
         call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d)
#else
         call neko_error('No device backend configured')
#endif
       end associate
    else
       associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
            gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
            bo_d=>this%shared_blk_off_d, strm=>this%gs_stream)

         ! Push the host copy of the shared buffer back to the device
         if (this%shared_on_host) then
            call device_memcpy(v, v_d, m, host_to_device, &
                               sync=.false., strm=strm)
         end if

#ifdef HAVE_HIP
         call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
#elif HAVE_CUDA
         call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
#elif HAVE_OPENCL
         call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d)
#else
         call neko_error('No device backend configured')
#endif

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
         ! Let the caller wait on its own event if it supplied one,
         ! otherwise block until the stream has drained
         if (c_associated(event)) then
            call device_event_record(event, strm)
         else
            call device_sync(strm)
         end if
#endif

       end associate
    end if

  end subroutine gs_scatter_device


end module gs_device

opencl_gather_kernel
void opencl_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op)
Definition gs.c:58

opencl_scatter_kernel
void opencl_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo)
Definition gs.c:166

cuda_gather_kernel
void cuda_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, cudaStream_t stream)
Definition gs.cu:51

cuda_scatter_kernel
void cuda_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, cudaStream_t stream)
Definition gs.cu:96

device::device_get_ptr
Return the device pointer for an associated Fortran array.
Definition device.F90:95

device::device_map
Map a Fortran array to a device (allocate and associate)
Definition device.F90:71

device::device_memcpy
Copy data between host and device (or device and device)
Definition device.F90:65

gs_device::hip_gather_kernel
Definition gs_device.F90:72

gs_device::hip_scatter_kernel
Definition gs_device.F90:82

utils::neko_error
Definition utils.f90:42

device
Device abstraction, common interface for various accelerators.
Definition device.F90:34

device::device_event_record
subroutine, public device_event_record(event, stream)
Record a device event.
Definition device.F90:1225

device::host_to_device
integer, parameter, public host_to_device
Definition device.F90:46

device::device_free
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:200

device::device_to_host
integer, parameter, public device_to_host
Definition device.F90:46

device::device_event_destroy
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition device.F90:1209

device::glb_cmd_queue
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
Definition device.F90:50

device::device_event_create
subroutine, public device_event_create(event, flags)
Create a device event.
Definition device.F90:1179

gs_bcknd
Defines a gather-scatter backend.
Definition gs_bcknd.f90:34

gs_device
Generic Gather-scatter backend for accelerators.
Definition gs_device.F90:34

gs_device::gs_gather_device
subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
Gather kernel.
Definition gs_device.F90:234

gs_device::gs_scatter_device
subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
Scatter kernel.
Definition gs_device.F90:373

gs_device::gs_device_init
subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
Accelerator backend initialisation.
Definition gs_device.F90:137

gs_device::gs_device_free
subroutine gs_device_free(this)
Accelerator backend deallocation.
Definition gs_device.F90:175

gs_ops
Defines Gather-scatter operations.
Definition gs_ops.f90:34

neko_config
Build configurations.
Definition neko_config.f90:34

num_types
Definition num_types.f90:1

num_types::rp
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12

utils
Utilities.
Definition utils.f90:35

gs_bcknd::gs_bcknd_t
Gather-scatter backend.
Definition gs_bcknd.f90:44

gs_device::gs_device_t
Gather-scatter backend for offloading devices.
Definition gs_device.F90:47