use, intrinsic :: iso_c_binding, only : c_sizeof, c_int32_t, &
     c_ptr, c_null_ptr, c_size_t, c_associated

! Buffer type components (packing/unpacking buffers, one entry per PE)
integer, allocatable :: ndofs(:)         ! number of dofs shared with each PE
integer, allocatable :: offset(:)        ! per-PE offset into the packed buffer
integer, allocatable :: remote_offset(:) ! matching offset on the remote PE (exchanged lazily)
type(c_ptr) :: buf_d = c_null_ptr        ! packed buffer on the NVSHMEM symmetric heap
type(c_ptr) :: dof_d = c_null_ptr        ! device copy of the dof index map
 
 
! Communication type components
type(c_ptr), allocatable :: stream(:)      ! one stream per receiving PE
type(c_ptr), allocatable :: event(:)       ! one event per receiving PE
integer :: nvshmem_counter = 1             ! tag for each pack-and-push operation
type(c_ptr), allocatable :: notifydone(:)  ! 8-byte signalling flags on the symmetric heap
type(c_ptr), allocatable :: notifyready(:) ! 8-byte signalling flags on the symmetric heap
 
 
#if defined (HAVE_CUDA) && defined(HAVE_NVSHMEM)

subroutine cudamalloc_nvshmem(ptr, size) &
     bind(c, name='cudamalloc_nvshmem')
  use, intrinsic :: iso_c_binding
  type(c_ptr) :: ptr
  integer(c_size_t), value :: size
end subroutine cudamalloc_nvshmem

subroutine cudafree_nvshmem(ptr) &
     bind(c, name='cudafree_nvshmem')
  use, intrinsic :: iso_c_binding
  type(c_ptr) :: ptr
end subroutine cudafree_nvshmem
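These bindings wrap NVSHMEM's symmetric-heap allocator on the C side. A minimal usage sketch of the pairing, assuming the interfaces above are in scope (the buffer name and element count are illustrative, not from the source):

   type(c_ptr) :: sym_buf = c_null_ptr
   integer(c_size_t) :: nbytes
   real(c_rp) :: rp_dummy

   ! NVSHMEM allocations are symmetric: every PE must request the same
   ! size, which is why buf_init below sizes the buffer with an MPI_MAX
   ! reduction over all PEs.
   nbytes = c_sizeof(rp_dummy) * 1024_c_size_t
   call cudamalloc_nvshmem(sym_buf, nbytes)

   ! ... use sym_buf as a pack-and-push staging buffer ...

   if (c_associated(sym_buf)) call cudafree_nvshmem(sym_buf)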
 
subroutine cuda_gs_pack_and_push(u_d, buf_d, dof_d, offset, n, stream, &
     srank, rbuf_d, roffset, remote_offset, &
     rrank, nvshmem_counter, notifyDone, &
     notifyReady, iter) &
     bind(c, name='cuda_gs_pack_and_push')
  use, intrinsic :: iso_c_binding
  integer(c_int), value :: n, offset, srank, roffset, rrank, iter
  integer(c_int), value :: nvshmem_counter
  type(c_ptr), value :: u_d, buf_d, dof_d, stream, rbuf_d, notifydone, notifyready
  integer(c_int), dimension(*) :: remote_offset
end subroutine cuda_gs_pack_and_push
 
subroutine cuda_gs_pack_and_push_wait(stream, nvshmem_counter, notifyDone) &
     bind(c, name='cuda_gs_pack_and_push_wait')
  use, intrinsic :: iso_c_binding
  integer(c_int), value :: nvshmem_counter
  type(c_ptr), value :: stream, notifydone
end subroutine cuda_gs_pack_and_push_wait
 
subroutine cuda_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
     bind(c, name='cuda_gs_unpack')
  use, intrinsic :: iso_c_binding
  integer(c_int), value :: op, offset, n
  type(c_ptr), value :: u_d, buf_d, dof_d, stream
end subroutine cuda_gs_unpack
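Together, these three bindings implement the push-based exchange used by gs_device_shmem_nbwait below: cuda_gs_pack_and_push gathers the shared dofs into the send buffer and pushes them straight into the receiving PE's symmetric buffer, tagging the transfer with nvshmem_counter and signalling through the notifyDone/notifyReady flags; cuda_gs_pack_and_push_wait blocks a stream until a tagged push has been delivered; and cuda_gs_unpack applies the received values back into the field with the gather-scatter operation op.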
 
! from gs_device_shmem_buf_init
integer, allocatable, intent(inout) :: pe_order(:)
type(stack_i4_t), allocatable, intent(inout) :: dof_stack(:)
logical, intent(in) :: mark_dupes
integer, allocatable :: dofs(:)
integer :: i, j, total, max_total
integer(c_size_t) :: sz
integer :: dupe, marked, k
real(c_rp) :: rp_dummy
integer(c_int32_t) :: i4_dummy
 
allocate(this%ndofs(size(pe_order)))
allocate(this%offset(size(pe_order)))
allocate(this%remote_offset(size(pe_order)))

! Remote offsets are unknown until first exchanged (see nbwait below)
do i = 1, size(pe_order)
   this%remote_offset(i) = -1
end do

! Per-PE dof counts and their offsets into the packed buffer
total = 0
do i = 1, size(pe_order)
   this%ndofs(i) = dof_stack(pe_order(i))%size()
   this%offset(i) = total
   total = total + this%ndofs(i)
end do

! NVSHMEM allocations are symmetric, so size the buffer by the largest
! total across all PEs
call mpi_allreduce(total, max_total, 1, mpi_integer, mpi_max, neko_comm)

sz = c_sizeof(rp_dummy) * max_total
call cudamalloc_nvshmem(this%buf_d, sz)

sz = c_sizeof(i4_dummy) * total
call device_alloc(this%dof_d, sz)

if (mark_dupes) call doftable%init(2*total)
allocate(dofs(total))
 
do i = 1, size(pe_order)
   select type (arr => dof_stack(pe_order(i))%data)
   type is (integer)
      do j = 1, this%ndofs(i)
         k = this%offset(i) + j
         ! A hit in doftable means the dof was seen before: mark the
         ! first occurrence as a duplicate by negating its stored id
         if (doftable%get(arr(j), dupe) .eq. 0) then
            if (dofs(dupe) .gt. 0) then
               dofs(dupe) = -dofs(dupe)
            end if
         else
            ! First occurrence: remember its packed position
            call doftable%set(arr(j), k)
         end if
      end do
   end select
end do
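The sign of a packed dof id thus acts as a "duplicated" flag. A minimal, self-contained sketch of the same marking scheme, assuming a dense id range so a plain array can stand in for Neko's htable (the input values are made up):

   program mark_dupes_sketch
     implicit none
     integer, parameter :: in(6) = [7, 3, 9, 3, 7, 5]
     integer :: dofs(6), first(100), j, d

     first = 0                   ! first(id) = packed index of id's first occurrence
     do j = 1, size(in)
        d = in(j)
        if (first(d) /= 0) then  ! seen before: negate both occurrences
           if (dofs(first(d)) > 0) dofs(first(d)) = -dofs(first(d))
           dofs(j) = -d
        else
           first(d) = j
           dofs(j) = d
        end if
     end do
     print '(6i4)', dofs         ! prints: -7 -3 9 -3 -7 5
   end program mark_dupes_sketch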
 
 
! from gs_device_shmem_buf_free
if (allocated(this%ndofs)) deallocate(this%ndofs)
if (allocated(this%offset)) deallocate(this%offset)

if (c_associated(this%buf_d)) call cudafree_nvshmem(this%buf_d)
if (c_associated(this%dof_d)) call device_free(this%dof_d)
 
 
! from gs_device_shmem_init
call this%init_order(send_pe, recv_pe)

call this%send_buf%init(this%send_pe, this%send_dof, .false.)
call this%recv_buf%init(this%recv_pe, this%recv_dof, .true.)

#if defined(HAVE_HIP) || defined(HAVE_CUDA)

! One stream and one event per receiving PE
allocate(this%stream(size(this%recv_pe)))
do i = 1, size(this%recv_pe)

allocate(this%event(size(this%recv_pe)))
do i = 1, size(this%recv_pe)

! 8-byte notification flags on the NVSHMEM symmetric heap
allocate(this%notifyDone(size(this%recv_pe)))
allocate(this%notifyReady(size(this%recv_pe)))
do i = 1, size(this%recv_pe)
   call cudamalloc_nvshmem(this%notifyDone(i), 8_8)
   call cudamalloc_nvshmem(this%notifyReady(i), 8_8)
end do
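The elided bodies of the first two loops create the per-PE streams and events. A plausible sketch using the device-abstraction routines listed further down (the flag values passed are assumptions, not from the source):

   do i = 1, size(this%recv_pe)
      ! high-priority stream per receiving PE (flags value assumed)
      call device_stream_create_with_priority(this%stream(i), 1, strm_high_prio)
   end do

   do i = 1, size(this%recv_pe)
      ! event used later to order unpack work against the caller's stream
      call device_event_create(this%event(i), 2)
   end do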
 
 
! from gs_device_shmem_free
call this%send_buf%free()
call this%recv_buf%free()

call this%free_order()
call this%free_dofs()

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
if (allocated(this%stream)) then
   do i = 1, size(this%stream)
      call device_stream_destroy(this%stream(i))
   end do
   deallocate(this%stream)
end if
 
 
! from gs_device_shmem_nbsend
integer, intent(in) :: n
real(kind=rp), dimension(n), intent(inout) :: u
type(c_ptr), intent(inout) :: deps
type(c_ptr), intent(inout) :: strm

do i = 1, size(this%send_pe)
 
 
! from gs_device_shmem_nbwait
integer, intent(in) :: n
real(kind=rp), dimension(n), intent(inout) :: u
type(c_ptr), intent(inout) :: strm
integer :: op, done_req, i

do i = 1, size(this%send_pe)
   ! Exchange packed-buffer offsets with each peer once, so this PE
   ! knows where to push into the receiver's symmetric buffer
   if (this%recv_buf%remote_offset(i) .eq. -1) then
      call mpi_sendrecv(this%recv_buf%offset(i), 1, mpi_integer, &
           this%recv_pe(i), 0, &
           this%recv_buf%remote_offset(i), 1, mpi_integer, &
           this%send_pe(i), 0, neko_comm, mpi_status_ignore)
   end if

   call cuda_gs_pack_and_push(u_d, &
        this%send_buf%buf_d, &
        this%send_buf%dof_d, &
        this%send_buf%offset(i), &
        this%send_buf%ndofs(i), &
        this%stream(i), &
        this%send_pe(i), &
        this%recv_buf%buf_d, &
        this%recv_buf%offset(i), &
        this%recv_buf%remote_offset, &
        this%recv_pe(i), &
        this%nvshmem_counter, &
        this%notifyDone(i), &
        this%notifyReady(i), &
        i)
   this%nvshmem_counter = this%nvshmem_counter + 1
end do

! The i-th push above was tagged nvshmem_counter - size(send_pe) + i - 1
do i = 1, size(this%send_pe)
   call cuda_gs_pack_and_push_wait(this%stream(i), &
        this%nvshmem_counter - size(this%send_pe) + i - 1, &
        this%notifyDone(i))
end do

do done_req = 1, size(this%recv_pe)
   call cuda_gs_unpack(u_d, op, &
        this%recv_buf%buf_d, &
        this%recv_buf%dof_d, &
        this%recv_buf%offset(done_req), &
        this%recv_buf%ndofs(done_req), &
        this%stream(done_req))
end do

! Order the caller's stream after the per-PE unpack streams
do done_req = 1, size(this%recv_pe)
   call device_stream_wait_event(strm, &
        this%event(done_req), 0)
end do
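Note the counter bookkeeping: each push increments this%nvshmem_counter, so after the first loop the i-th push carries the tag this%nvshmem_counter - size(this%send_pe) + i - 1, which is exactly what the wait loop passes back. For example, with three send PEs and a counter starting at 1, the pushes are tagged 1, 2 and 3, the counter ends at 4, and the wait loop recovers 4 - 3 + i - 1 = i.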
 
 
void cuda_gs_unpack(real *u_d, int op, real *buf_d, int *dof_d, int offset, int n, cudaStream_t stream)
 
Return the device pointer for an associated Fortran array.

Copy data between host and device (or device and device).

Synchronize a device or stream.
 
type(mpi_comm), public :: neko_comm
MPI communicator.
 
Device abstraction, common interface for various accelerators.
 
subroutine, public device_event_record(event, stream)
Record a device event.
 
integer, parameter, public :: host_to_device
 
subroutine, public device_free(x_d)
Deallocate memory on the device.
 
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
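The allocate/free pair above mirrors how this module manages dof_d. A minimal sketch, assuming c_rp is in scope (the variable name and element count are made up for the example):

   use, intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr, c_size_t, c_sizeof

   type(c_ptr) :: x_d = c_null_ptr
   integer(c_size_t) :: s
   real(c_rp) :: rp_dummy

   s = c_sizeof(rp_dummy) * 4096_c_size_t  ! bytes for 4096 working-precision reals
   call device_alloc(x_d, s)               ! allocate on the device
   ! ... launch kernels that read/write x_d ...
   call device_free(x_d)                   ! release when done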
 
subroutine, public device_stream_create_with_priority(stream, flags, prio)
Create a device stream/command queue with priority.
 
subroutine, public device_stream_wait_event(stream, event, flags)
Synchronize a device stream with an event.
 
subroutine, public device_event_create(event, flags)
Create a device event queue.
 
integer, public :: strm_high_prio
High priority stream setting.
 
subroutine, public device_stream_destroy(stream)
Destroy a device stream/command queue.
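Taken together, the stream/event routines above support the ordering pattern this module uses in nbwait: record an event on a worker stream and make another stream wait on it. A schematic sketch (flag values are assumptions; main_strm stands for an existing stream, e.g. the caller's):

   type(c_ptr) :: worker, main_strm, ev

   call device_stream_create_with_priority(worker, 1, strm_high_prio)
   call device_event_create(ev, 2)

   ! ... enqueue work on worker ...

   call device_event_record(ev, worker)            ! mark completion point
   call device_stream_wait_event(main_strm, ev, 0) ! main stream waits for it

   call device_stream_destroy(worker)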
 
Defines a gather-scatter communication method.
 
Defines GPU aware MPI gather-scatter communication.
 
subroutine gs_device_shmem_nbsend(this, u, n, deps, strm)
Post non-blocking send operations.
 
subroutine gs_device_shmem_nbrecv(this)
Post non-blocking receive operations.
 
subroutine gs_device_shmem_nbwait(this, u, n, op, strm)
Wait for non-blocking operations.
 
subroutine gs_device_shmem_buf_init(this, pe_order, dof_stack, mark_dupes)
 
subroutine gs_device_shmem_free(this)
Deallocate device SHMEM based communication method.
 
subroutine gs_device_shmem_buf_free(this)
 
subroutine gs_device_shmem_init(this, send_pe, recv_pe)
Initialise device SHMEM based communication method.
 
Defines Gather-scatter operations.
 
Implements a hash table ADT.
 
integer, parameter, public :: c_rp
 
integer, parameter, public :: rp
Global precision used in computations.
 
Implements a dynamic stack ADT.
 
Gather-scatter communication method.
 
Buffers for non-blocking communication and packing/unpacking.
 
Gather-scatter communication using device SHMEM. The arrays are indexed per PE like send_pe and recv_pe.
 
Integer based hash table.