43  use, 
intrinsic :: iso_c_binding, only : c_sizeof, c_int32_t
 
   49     integer, 
allocatable :: ndofs(:)
 
   50     integer, 
allocatable :: offset(:)
 
   52     type(c_ptr) :: reqs = c_null_ptr           
 
   53     type(c_ptr) :: buf_d = c_null_ptr          
 
   54     type(c_ptr) :: dof_d = c_null_ptr          
 
 
   65     type(c_ptr), 
allocatable :: stream(:)
 
   66     type(c_ptr), 
allocatable :: event(:)
 
   68     type(c_ptr) :: send_event = c_null_ptr
 
 
   79     subroutine hip_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
 
   80          bind(c, name=
'hip_gs_pack')
 
   81       use, 
intrinsic :: iso_c_binding
 
   83       integer(c_int), 
value :: n, offset
 
   84       type(c_ptr), 
value :: u_d, buf_d, dof_d, stream
 
 
   90          bind(c, name=
'hip_gs_unpack')
 
   91       use, 
intrinsic :: iso_c_binding
 
   93       integer(c_int), 
value :: op, offset, n
 
   94       type(c_ptr), 
value :: u_d, buf_d, dof_d, stream
 
 
   99     subroutine cuda_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
 
  100          bind(c, name=
'cuda_gs_pack')
 
  101       use, 
intrinsic :: iso_c_binding
 
  103       integer(c_int), 
value :: n, offset
 
  104       type(c_ptr), 
value :: u_d, buf_d, dof_d, stream
 
  109     subroutine cuda_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
 
  110          bind(c, name=
'cuda_gs_unpack')
 
  111       use, 
intrinsic :: iso_c_binding
 
  113       integer(c_int), 
value :: op, offset, n
 
  114       type(c_ptr), 
value :: u_d, buf_d, dof_d, stream
 
  121          bind(c, name=
'device_mpi_init_reqs')
 
  122       use, 
intrinsic :: iso_c_binding
 
  124       integer(c_int), 
value :: n
 
 
  131          bind(c, name=
'device_mpi_free_reqs')
 
  132       use, 
intrinsic :: iso_c_binding
 
 
  140          bind(c, name=
'device_mpi_isend')
 
  141       use, 
intrinsic :: iso_c_binding
 
  143       integer(c_int), 
value :: offset, nbytes, rank, i
 
  144       type(c_ptr), 
value :: buf_d, reqs
 
 
  150          bind(c, name=
'device_mpi_irecv')
 
  151       use, 
intrinsic :: iso_c_binding
 
  153       integer(c_int), 
value :: offset, nbytes, rank, i
 
  154       type(c_ptr), 
value :: buf_d, reqs
 
 
  160          bind(c, name=
'device_mpi_test')
 
  161       use, 
intrinsic :: iso_c_binding
 
  163       integer(c_int), 
value :: i
 
  164       type(c_ptr), 
value :: reqs
 
 
  170          bind(c, name=
'device_mpi_waitall')
 
  171       use, 
intrinsic :: iso_c_binding
 
  173       integer(c_int), 
value :: n
 
  174       type(c_ptr), 
value :: reqs
 
 
  180          bind(c, name=
'device_mpi_waitany')
 
  181       use, 
intrinsic :: iso_c_binding
 
  183       integer(c_int), 
value :: n
 
  185       type(c_ptr), 
value :: reqs
 
 
  193    integer, 
allocatable, 
intent(inout) :: pe_order(:)
 
  194    type(
stack_i4_t), 
allocatable, 
intent(inout) :: dof_stack(:)
 
  195    logical, 
intent(in) :: mark_dupes
 
  196    integer, 
allocatable :: dofs(:)
 
  197    integer :: i, j, total
 
  198    integer(c_size_t) :: sz
 
  200    integer :: dupe, marked, k
 
  201    real(c_rp) :: rp_dummy
 
  202    integer(c_int32_t) :: i4_dummy
 
  206    allocate(this%ndofs(
size(pe_order)))
 
  207    allocate(this%offset(
size(pe_order)))
 
  210    do i = 1, 
size(pe_order)
 
  211       this%ndofs(i) = dof_stack(pe_order(i))%size()
 
  212       this%offset(i) = total
 
  213       total = total + this%ndofs(i)
 
  218    sz = c_sizeof(rp_dummy) * total
 
  221    sz = c_sizeof(i4_dummy) * total
 
  224    if (mark_dupes) 
call doftable%init(2*total)
 
  225    allocate(dofs(total))
 
  229    do i = 1, 
size(pe_order)
 
  231       select type (arr => dof_stack(pe_order(i))%data)
 
  233          do j = 1, this%ndofs(i)
 
  234             k = this%offset(i) + j
 
  236                if (doftable%get(arr(j), dupe) .eq. 0) 
then 
  237                   if (dofs(dupe) .gt. 0) 
then 
  238                      dofs(dupe) = -dofs(dupe)
 
  244                   call doftable%set(arr(j), k)
 
 
  266    if (
allocated(this%ndofs)) 
deallocate(this%ndofs)
 
  267    if (
allocated(this%offset)) 
deallocate(this%offset)
 
  269    if (c_associated(this%buf_d)) 
call device_free(this%buf_d)
 
  270    if (c_associated(this%dof_d)) 
call device_free(this%dof_d)
 
 
  280    call this%init_order(send_pe, recv_pe)
 
  282    call this%send_buf%init(this%send_pe, this%send_dof, .false.)
 
  283    call this%recv_buf%init(this%recv_pe, this%recv_dof, .true.)
 
  285#if defined(HAVE_HIP) || defined(HAVE_CUDA) 
  287    allocate(this%stream(
size(this%recv_pe)))
 
  288    do i = 1, 
size(this%recv_pe)
 
  292    allocate(this%event(
size(this%recv_pe)))
 
  293    do i = 1, 
size(this%recv_pe)
 
 
  308    call this%send_buf%free()
 
  309    call this%recv_buf%free()
 
  311    call this%free_order()
 
  312    call this%free_dofs()
 
  314#if defined(HAVE_HIP) || defined(HAVE_CUDA) 
  315    if (
allocated(this%stream)) 
then 
  316       do i = 1, 
size(this%stream)
 
  319       deallocate(this%stream)
 
 
  328    integer, 
intent(in) :: n
 
  329    real(kind=
rp), 
dimension(n), 
intent(inout) :: u
 
  330    type(c_ptr), 
intent(inout) :: deps
 
  331    type(c_ptr), 
intent(inout) :: strm
 
  337    if (iand(this%nb_strtgy, 1) .eq. 0) 
then 
  341                        this%send_buf%buf_d, &
 
  342                        this%send_buf%dof_d, &
 
  343                        0, this%send_buf%total, &
 
  347                         this%send_buf%buf_d, &
 
  348                         this%send_buf%dof_d, &
 
  349                         0, this%send_buf%total, &
 
  357       do i = 1, 
size(this%send_pe)
 
  359                                rp*this%send_buf%offset(i), &
 
  360                                rp*this%send_buf%ndofs(i), this%send_pe(i), &
 
  361                                this%send_buf%reqs, i)
 
  366       do i = 1, 
size(this%send_pe)
 
  370                           this%send_buf%buf_d, &
 
  371                           this%send_buf%dof_d, &
 
  372                           this%send_buf%offset(i), &
 
  373                           this%send_buf%ndofs(i), &
 
  377                            this%send_buf%buf_d, &
 
  378                            this%send_buf%dof_d, &
 
  379                            this%send_buf%offset(i), &
 
  380                            this%send_buf%ndofs(i), &
 
  388       do i = 1, 
size(this%send_pe)
 
  391                                rp*this%send_buf%offset(i), &
 
  392                                rp*this%send_buf%ndofs(i), this%send_pe(i), &
 
  393                                this%send_buf%reqs, i)
 
 
  404    do i = 1, 
size(this%recv_pe)
 
  406                             rp*this%recv_buf%ndofs(i), this%recv_pe(i), &
 
  407                             this%recv_buf%reqs, i)
 
 
  415    integer, 
intent(in) :: n
 
  416    real(kind=
rp), 
dimension(n), 
intent(inout) :: u
 
  417    type(c_ptr), 
intent(inout) :: strm
 
  418    integer :: op, done_req, i
 
  423    if (iand(this%nb_strtgy, 2) .eq. 0) 
then 
  428                          this%recv_buf%buf_d, &
 
  429                          this%recv_buf%dof_d, &
 
  430                          0, this%recv_buf%total, &
 
  434                           this%recv_buf%buf_d, &
 
  435                           this%recv_buf%dof_d, &
 
  436                           0, this%recv_buf%total, &
 
  450                                  this%recv_buf%reqs, done_req) .ne. 0)
 
  454                             this%recv_buf%buf_d, &
 
  455                             this%recv_buf%dof_d, &
 
  456                             this%recv_buf%offset(done_req), &
 
  457                             this%recv_buf%ndofs(done_req), &
 
  458                             this%stream(done_req))
 
  461                              this%recv_buf%buf_d, &
 
  462                              this%recv_buf%dof_d, &
 
  463                              this%recv_buf%offset(done_req), &
 
  464                              this%recv_buf%ndofs(done_req), &
 
  465                              this%stream(done_req))
 
  475       do done_req = 1, 
size(this%recv_pe)
 
  477               this%event(done_req), 0)
 
 
void cuda_gs_unpack(real *u_d, int op, real *buf_d, int *dof_d, int offset, int n, cudaStream_t stream)
 
void cuda_gs_pack(void *u_d, void *buf_d, void *dof_d, int offset, int n, cudaStream_t stream)
 
Return the device pointer for an associated Fortran array.
 
Copy data between host and device (or device and device)
 
Synchronize a device or stream.
 
integer pe_size
MPI size of communicator.
 
Device abstraction, common interface for various accelerators.
 
subroutine, public device_event_record(event, stream)
Record a device event.
 
integer, parameter, public host_to_device
 
subroutine, public device_free(x_d)
Deallocate memory on the device.
 
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
 
subroutine, public device_stream_wait_event(stream, event, flags)
Synchronize a device stream with an event.
 
subroutine device_stream_create_with_priority(stream, flags, prio)
Create a device stream/command queue with priority.
 
subroutine, public device_event_create(event, flags)
Create a device event queue.
 
subroutine, public device_stream_destroy(stream)
Destroy a device stream/command queue.
 
Defines a gather-scatter communication method.
 
Defines GPU aware MPI gather-scatter communication.
 
subroutine gs_device_mpi_buf_init(this, pe_order, dof_stack, mark_dupes)
 
subroutine gs_device_mpi_nbrecv(this)
Post non-blocking receive operations.
 
subroutine gs_device_mpi_nbwait(this, u, n, op, strm)
Wait for non-blocking operations.
 
subroutine gs_device_mpi_free(this)
Deallocate MPI based communication method.
 
subroutine gs_device_mpi_nbsend(this, u, n, deps, strm)
Post non-blocking send operations.
 
subroutine gs_device_mpi_buf_free(this)
 
subroutine gs_device_mpi_init(this, send_pe, recv_pe)
Initialise MPI based communication method.
 
Defines Gather-scatter operations.
 
Implements a hash table ADT.
 
integer, parameter, public c_rp
 
integer, parameter, public rp
Global precision used in computations.
 
Implements a dynamic stack ADT.
 
Gather-scatter communication method.
 
Buffers for non-blocking communication and packing/unpacking.
 
Gather-scatter communication using device MPI. The arrays are indexed per PE like send_pe and @ recv_...
 
Integer based hash table.