use, intrinsic :: iso_c_binding, only : c_sizeof, c_int32_t, &
     c_ptr, c_null_ptr, c_size_t, c_associated
! Buffers for non-blocking communication and packing/unpacking
integer, allocatable :: ndofs(:)   ! number of dofs to exchange per PE
integer, allocatable :: offset(:)  ! per-PE offset into the staging buffer
type(c_ptr) :: reqs = c_null_ptr   ! opaque handle to the MPI request array
type(c_ptr) :: buf_d = c_null_ptr  ! device staging buffer
type(c_ptr) :: dof_d = c_null_ptr  ! device copy of the dof map

! Per-PE streams, events and strategy of the gather-scatter type
type(c_ptr), allocatable :: stream(:)
type(c_ptr), allocatable :: event(:)
integer :: nb_strtgy               ! non-blocking strategy bitmask (see nbsend/nbwait)
type(c_ptr) :: send_event = c_null_ptr
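! Worked illustration of this layout (sizes hypothetical): three neighbours
! holding 4, 2 and 3 dofs partition buf_d as follows.
!
!   integer :: ndofs(3) = [4, 2, 3]
!   integer :: offset(3), total, i
!
!   total = 0
!   do i = 1, 3
!      offset(i) = total          ! exclusive prefix sum: [0, 4, 6]
!      total = total + ndofs(i)   ! total = 9
!   end do
!
! Neighbour i owns elements offset(i)+1 .. offset(i)+ndofs(i) of buf_d.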
#ifdef HAVE_HIP
interface
   subroutine hip_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
        bind(c, name = 'hip_gs_pack')
     use, intrinsic :: iso_c_binding
     integer(c_int), value :: n, offset
     type(c_ptr), value :: u_d, buf_d, dof_d, stream
   end subroutine hip_gs_pack

   subroutine hip_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
        bind(c, name = 'hip_gs_unpack')
     use, intrinsic :: iso_c_binding
     integer(c_int), value :: op, offset, n
     type(c_ptr), value :: u_d, buf_d, dof_d, stream
   end subroutine hip_gs_unpack
end interface
#endif
#ifdef HAVE_CUDA
interface
   subroutine cuda_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
        bind(c, name = 'cuda_gs_pack')
     use, intrinsic :: iso_c_binding
     integer(c_int), value :: n, offset
     type(c_ptr), value :: u_d, buf_d, dof_d, stream
   end subroutine cuda_gs_pack

   subroutine cuda_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
        bind(c, name = 'cuda_gs_unpack')
     use, intrinsic :: iso_c_binding
     integer(c_int), value :: op, offset, n
     type(c_ptr), value :: u_d, buf_d, dof_d, stream
   end subroutine cuda_gs_unpack
end interface
#endif
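! The pack/unpack kernels bound above are, in essence, an indexed gather and
! scatter through the dof map. Host-side sketches of the semantics follow
! (array names and the additive op are illustrative; the real kernels run on
! the device and treat negated duplicate dofs specially):
subroutine gs_pack_sketch(u, buf, dof, offset, n)
  integer, intent(in) :: offset, n, dof(*)
  real, intent(in) :: u(*)
  real, intent(inout) :: buf(*)
  integer :: i
  do i = offset + 1, offset + n
     buf(i) = u(abs(dof(i)))   ! gather u into the contiguous staging buffer
  end do
end subroutine gs_pack_sketch

subroutine gs_unpack_add_sketch(u, buf, dof, offset, n)
  integer, intent(in) :: offset, n, dof(*)
  real, intent(inout) :: u(*)
  real, intent(in) :: buf(*)
  integer :: i
  do i = offset + 1, offset + n
     u(abs(dof(i))) = u(abs(dof(i))) + buf(i)   ! scatter with op = add
  end do
end subroutine gs_unpack_add_sketch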
interface
   subroutine device_mpi_init_reqs(n, reqs) &
        bind(c, name = 'device_mpi_init_reqs')
     use, intrinsic :: iso_c_binding
     integer(c_int), value :: n
     type(c_ptr) :: reqs
   end subroutine device_mpi_init_reqs

   subroutine device_mpi_free_reqs(reqs) &
        bind(c, name = 'device_mpi_free_reqs')
     use, intrinsic :: iso_c_binding
     type(c_ptr) :: reqs
   end subroutine device_mpi_free_reqs
end interface
interface
   subroutine device_mpi_isend(buf_d, offset, nbytes, rank, reqs, i) &
        bind(c, name = 'device_mpi_isend')
     use, intrinsic :: iso_c_binding
     integer(c_int), value :: offset, nbytes, rank, i
     type(c_ptr), value :: buf_d, reqs
   end subroutine device_mpi_isend

   subroutine device_mpi_irecv(buf_d, offset, nbytes, rank, reqs, i) &
        bind(c, name = 'device_mpi_irecv')
     use, intrinsic :: iso_c_binding
     integer(c_int), value :: offset, nbytes, rank, i
     type(c_ptr), value :: buf_d, reqs
   end subroutine device_mpi_irecv
end interface
interface
   integer(c_int) function device_mpi_test(reqs, i) &
        bind(c, name = 'device_mpi_test')
     use, intrinsic :: iso_c_binding
     integer(c_int), value :: i
     type(c_ptr), value :: reqs
   end function device_mpi_test

   subroutine device_mpi_waitall(n, reqs) &
        bind(c, name = 'device_mpi_waitall')
     use, intrinsic :: iso_c_binding
     integer(c_int), value :: n
     type(c_ptr), value :: reqs
   end subroutine device_mpi_waitall

   integer(c_int) function device_mpi_waitany(n, reqs, i) &
        bind(c, name = 'device_mpi_waitany')
     use, intrinsic :: iso_c_binding
     integer(c_int), value :: n
     integer(c_int) :: i   ! index of the completed request (output)
     type(c_ptr), value :: reqs
   end function device_mpi_waitany
end interface
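! Taken together, these bindings manage an opaque array of MPI requests
! allocated on the C side. A sketch of the intended lifecycle, assuming the
! interfaces above are in scope (npes, byte_off, nbytes and ranks are
! illustrative placeholders, not names from this module):
subroutine demo_reqs_lifecycle(npes, buf_d, byte_off, nbytes, ranks)
  use, intrinsic :: iso_c_binding
  integer, intent(in) :: npes, byte_off(npes), nbytes(npes), ranks(npes)
  type(c_ptr), intent(in) :: buf_d
  type(c_ptr) :: reqs
  integer :: i, done

  reqs = c_null_ptr
  call device_mpi_init_reqs(npes, reqs)   ! allocate npes request slots

  do i = 1, npes                          ! post one receive per neighbour
     call device_mpi_irecv(buf_d, byte_off(i), nbytes(i), ranks(i), reqs, i)
  end do
  ! ... post the matching sends with device_mpi_isend, then drain
  ! completions in whatever order they finish:
  do while (device_mpi_waitany(npes, reqs, done) .ne. 0)
     ! handle the message from neighbour 'done'
  end do

  call device_mpi_free_reqs(reqs)         ! release the request array
end subroutine demo_reqs_lifecycle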
subroutine gs_device_mpi_buf_init(this, pe_order, dof_stack, mark_dupes)
  class(gs_device_mpi_buf_t), intent(inout) :: this
  integer, allocatable, intent(inout) :: pe_order(:)
  type(stack_i4_t), allocatable, intent(inout) :: dof_stack(:)
  logical, intent(in) :: mark_dupes
  integer, allocatable :: dofs(:)
  integer :: i, j, total
  integer(c_size_t) :: sz
  type(htable_i4_t) :: doftable
  integer :: dupe, marked, k
  real(c_rp) :: rp_dummy
  integer(c_int32_t) :: i4_dummy

  allocate(this%ndofs(size(pe_order)))
  allocate(this%offset(size(pe_order)))

  ! Count dofs per PE and compute exclusive prefix-sum offsets
  total = 0
  do i = 1, size(pe_order)
     this%ndofs(i) = dof_stack(pe_order(i))%size()
     this%offset(i) = total
     total = total + this%ndofs(i)
  end do

  ! Size the device buffers: reals for the data, 32-bit ints for the dof map
  sz = c_sizeof(rp_dummy) * total
  call device_alloc(this%buf_d, sz)

  sz = c_sizeof(i4_dummy) * total
  call device_alloc(this%dof_d, sz)

  if (mark_dupes) call doftable%init(2*total)
  allocate(dofs(total))

  ! Flatten the per-PE dof stacks; with mark_dupes, negate every copy of a
  ! dof that occurs more than once so the unpack kernel can detect them
  do i = 1, size(pe_order)
     select type (arr => dof_stack(pe_order(i))%data)
     type is (integer)
        do j = 1, this%ndofs(i)
           k = this%offset(i) + j
           if (mark_dupes) then
              if (doftable%get(arr(j), dupe) .eq. 0) then
                 ! dof already seen: negate the earlier copy and this one
                 if (dofs(dupe) .gt. 0) dofs(dupe) = -dofs(dupe)
                 dofs(k) = -arr(j)
              else
                 call doftable%set(arr(j), k)
                 dofs(k) = arr(j)
              end if
           else
              dofs(k) = arr(j)
           end if
        end do
     end select
  end do
  ! (The flattened dof map is then copied to dof_d on the device.)
end subroutine gs_device_mpi_buf_init
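! The marking rule above guarantees that every copy of a shared dof ends up
! negative, including the first occurrence. A self-contained sketch of the
! same rule, with a linear search standing in for the htable_i4_t lookup:
program dupe_mark_demo
  implicit none
  integer :: raw(4) = [17, 5, 17, 9]  ! flattened send lists; dof 17 is shared
  integer :: dofs(4)
  integer :: j, k

  do k = 1, size(raw)
     dofs(k) = raw(k)
     do j = 1, k - 1
        if (abs(dofs(j)) .eq. raw(k)) then
           if (dofs(j) .gt. 0) dofs(j) = -dofs(j)  ! negate the earlier copy
           dofs(k) = -raw(k)                       ! store this copy negated
        end if
     end do
  end do

  print '(4i4)', dofs   ! prints: -17   5 -17   9
end program dupe_mark_demo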
subroutine gs_device_mpi_buf_free(this)
  class(gs_device_mpi_buf_t), intent(inout) :: this

  if (allocated(this%ndofs)) deallocate(this%ndofs)
  if (allocated(this%offset)) deallocate(this%offset)

  if (c_associated(this%buf_d)) call device_free(this%buf_d)
  if (c_associated(this%dof_d)) call device_free(this%dof_d)
end subroutine gs_device_mpi_buf_free
subroutine gs_device_mpi_init(this, send_pe, recv_pe)
  class(gs_device_mpi_t), intent(inout) :: this
  type(stack_i4_t), intent(inout) :: send_pe
  type(stack_i4_t), intent(inout) :: recv_pe
  integer :: i

  call this%init_order(send_pe, recv_pe)

  call this%send_buf%init(this%send_pe, this%send_dof, .false.)
  call this%recv_buf%init(this%recv_pe, this%recv_dof, .true.)

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
  ! One stream and one event per receiving PE, so each message can be
  ! unpacked independently (flag values as used by the device backend)
  allocate(this%stream(size(this%recv_pe)))
  do i = 1, size(this%recv_pe)
     call device_stream_create_with_priority(this%stream(i), 1, STRM_HIGH_PRIO)
  end do

  allocate(this%event(size(this%recv_pe)))
  do i = 1, size(this%recv_pe)
     call device_event_create(this%event(i), 2)
  end do
#endif
end subroutine gs_device_mpi_init
subroutine gs_device_mpi_free(this)
  class(gs_device_mpi_t), intent(inout) :: this
  integer :: i

  call this%send_buf%free()
  call this%recv_buf%free()

  call this%free_order()
  call this%free_dofs()

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
  if (allocated(this%stream)) then
     do i = 1, size(this%stream)
        call device_stream_destroy(this%stream(i))
     end do
     deallocate(this%stream)
  end if
#endif
end subroutine gs_device_mpi_free
subroutine gs_device_mpi_nbsend(this, u, n, deps, strm)
  class(gs_device_mpi_t), intent(inout) :: this
  integer, intent(in) :: n
  real(kind=rp), dimension(n), intent(inout) :: u
  type(c_ptr), intent(inout) :: deps
  type(c_ptr), intent(inout) :: strm
  integer :: i
  type(c_ptr) :: u_d

  u_d = device_get_ptr(u)

  if (iand(this%nb_strtgy, 1) .eq. 0) then
     ! Bulk strategy: pack all send dofs with one kernel on strm ...
#ifdef HAVE_HIP
     call hip_gs_pack(u_d, &
          this%send_buf%buf_d, &
          this%send_buf%dof_d, &
          0, this%send_buf%total, &
          strm)
#elif defined(HAVE_CUDA)
     call cuda_gs_pack(u_d, &
          this%send_buf%buf_d, &
          this%send_buf%dof_d, &
          0, this%send_buf%total, &
          strm)
#endif

     ! ... then post one send per neighbour; offsets and sizes are in
     ! bytes, hence the factor rp (the kind value doubles as the byte size)
     do i = 1, size(this%send_pe)
        call device_mpi_isend(this%send_buf%buf_d, &
             rp*this%send_buf%offset(i), &
             rp*this%send_buf%ndofs(i), this%send_pe(i), &
             this%send_buf%reqs, i)
     end do
  else
     ! Per-PE strategy: pack each neighbour's dofs on its own stream,
     ! overlapping packing with the sends
     do i = 1, size(this%send_pe)
#ifdef HAVE_HIP
        call hip_gs_pack(u_d, &
             this%send_buf%buf_d, &
             this%send_buf%dof_d, &
             this%send_buf%offset(i), &
             this%send_buf%ndofs(i), &
             this%stream(i))
#elif defined(HAVE_CUDA)
        call cuda_gs_pack(u_d, &
             this%send_buf%buf_d, &
             this%send_buf%dof_d, &
             this%send_buf%offset(i), &
             this%send_buf%ndofs(i), &
             this%stream(i))
#endif
     end do

     do i = 1, size(this%send_pe)
        call device_mpi_isend(this%send_buf%buf_d, &
             rp*this%send_buf%offset(i), &
             rp*this%send_buf%ndofs(i), this%send_pe(i), &
             this%send_buf%reqs, i)
     end do
  end if
end subroutine gs_device_mpi_nbsend

! Post non-blocking receive operations
subroutine gs_device_mpi_nbrecv(this)
  class(gs_device_mpi_t), intent(inout) :: this
  integer :: i

  do i = 1, size(this%recv_pe)
     call device_mpi_irecv(this%recv_buf%buf_d, rp*this%recv_buf%offset(i), &
          rp*this%recv_buf%ndofs(i), this%recv_pe(i), &
          this%recv_buf%reqs, i)
  end do
end subroutine gs_device_mpi_nbrecv
subroutine gs_device_mpi_nbwait(this, u, n, op, strm)
  class(gs_device_mpi_t), intent(inout) :: this
  integer, intent(in) :: n
  real(kind=rp), dimension(n), intent(inout) :: u
  type(c_ptr), intent(inout) :: strm
  integer :: op, done_req, i
  type(c_ptr) :: u_d

  u_d = device_get_ptr(u)

  if (iand(this%nb_strtgy, 2) .eq. 0) then
     ! Bulk strategy: wait for every receive, then unpack with one kernel
     call device_mpi_waitall(size(this%recv_pe), this%recv_buf%reqs)
#ifdef HAVE_HIP
     call hip_gs_unpack(u_d, op, &
          this%recv_buf%buf_d, &
          this%recv_buf%dof_d, &
          0, this%recv_buf%total, &
          strm)
#elif defined(HAVE_CUDA)
     call cuda_gs_unpack(u_d, op, &
          this%recv_buf%buf_d, &
          this%recv_buf%dof_d, &
          0, this%recv_buf%total, &
          strm)
#endif
  else
     ! Per-PE strategy: unpack each message on its own stream as soon as
     ! its receive completes
     do while (device_mpi_waitany(size(this%recv_pe), &
          this%recv_buf%reqs, done_req) .ne. 0)
#ifdef HAVE_HIP
        call hip_gs_unpack(u_d, op, &
             this%recv_buf%buf_d, &
             this%recv_buf%dof_d, &
             this%recv_buf%offset(done_req), &
             this%recv_buf%ndofs(done_req), &
             this%stream(done_req))
#elif defined(HAVE_CUDA)
        call cuda_gs_unpack(u_d, op, &
             this%recv_buf%buf_d, &
             this%recv_buf%dof_d, &
             this%recv_buf%offset(done_req), &
             this%recv_buf%ndofs(done_req), &
             this%stream(done_req))
#endif
     end do

     ! Let the main stream wait for the per-PE unpack events
     do done_req = 1, size(this%recv_pe)
        call device_stream_wait_event(strm, &
             this%event(done_req), 0)
     end do
  end if
end subroutine gs_device_mpi_nbwait
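! How a caller drives one exchange with this backend (a hedged sketch:
! GS_OP_ADD stands in for the caller's reduction op constant, and the
! type-bound names nbrecv/nbsend/nbwait follow the gs_comm interface):
subroutine demo_exchange(gs, u, n, deps, strm)
  class(gs_device_mpi_t), intent(inout) :: gs
  integer, intent(in) :: n
  real(kind=rp), intent(inout) :: u(n)
  type(c_ptr), intent(inout) :: deps, strm

  call gs%nbrecv()                       ! post receives before any send
  call gs%nbsend(u, n, deps, strm)       ! pack on the device, post sends
  ! ... overlap purely local gather-scatter work here ...
  call gs%nbwait(u, n, GS_OP_ADD, strm)  ! complete receives, unpack, reduce
end subroutine demo_exchange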
Defined in this module:

Defines GPU aware MPI gather-scatter communication.

type gs_device_mpi_buf_t
  Buffers for non-blocking communication and packing/unpacking.
type gs_device_mpi_t
  Gather-scatter communication using device MPI. The arrays are indexed per PE, like send_pe and recv_pe.
subroutine gs_device_mpi_buf_init(this, pe_order, dof_stack, mark_dupes)
  Initialise the communication buffers.
subroutine gs_device_mpi_buf_free(this)
  Deallocate the communication buffers.
subroutine gs_device_mpi_init(this, send_pe, recv_pe)
  Initialise MPI based communication method.
subroutine gs_device_mpi_free(this)
  Deallocate MPI based communication method.
subroutine gs_device_mpi_nbsend(this, u, n, deps, strm)
  Post non-blocking send operations.
subroutine gs_device_mpi_nbrecv(this)
  Post non-blocking receive operations.
subroutine gs_device_mpi_nbwait(this, u, n, op, strm)
  Wait for non-blocking operations.

C prototypes of the CUDA kernel wrappers:

void cuda_gs_pack(void *u_d, void *buf_d, void *dof_d, int offset, int n, cudaStream_t stream)
void cuda_gs_unpack(real *u_d, int op, real *buf_d, int *dof_d, int offset, int n, cudaStream_t stream)

Used modules and entities:

comm:
  integer, public pe_size
    MPI size of communicator.
  integer, public pe_rank
    MPI rank.

device (Device abstraction, common interface for various accelerators):
  subroutine, public device_alloc(x_d, s)
    Allocate memory on the device.
  subroutine, public device_free(x_d)
    Deallocate memory on the device.
  device_get_ptr
    Return the device pointer for an associated Fortran array.
  device_memcpy
    Copy data between host and device (or device and device).
  device_sync
    Synchronize a device or stream.
  integer, parameter, public host_to_device
    Direction flag for device_memcpy.
  subroutine, public device_memset(x_d, v, s, sync, strm)
    Set memory on the device to a value.
  subroutine, public device_stream_create_with_priority(stream, flags, prio)
    Create a device stream/command queue with priority.
  subroutine, public device_stream_destroy(stream)
    Destroy a device stream/command queue.
  subroutine, public device_stream_wait_event(stream, event, flags)
    Synchronize a device stream with an event.
  integer, public strm_high_prio
    High priority stream setting.
  subroutine, public device_event_create(event, flags)
    Create a device event queue.
  subroutine, public device_event_destroy(event)
    Destroy a device event.
  subroutine, public device_event_record(event, stream)
    Record a device event.

gs_comm (Defines a gather-scatter communication method):
  type gs_comm_t
    Gather-scatter communication method.

htable (Implements a hash table ADT):
  type htable_i4_t
    Integer based hash table.

num_types:
  integer, parameter, public rp
    Global precision used in computations.
  integer, parameter, public c_rp
    C-interoperable kind matching rp.

stack (Implements a dynamic stack ADT):
  type stack_i4_t
    Dynamic stack of integers.
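The nb_strtgy field referenced in nbsend and nbwait above acts as a two-bit
strategy mask; the following sketch spells out the decoding implied by the
iand tests in the code (the logical names are illustrative):

! Bit 0: 0 = one bulk pack kernel, 1 = one pack kernel per PE (nbsend)
! Bit 1: 0 = bulk unpack after waitall, 1 = per-PE unpack via waitany (nbwait)
logical :: per_pe_pack, per_pe_unpack
per_pe_pack   = iand(this%nb_strtgy, 1) .ne. 0
per_pe_unpack = iand(this%nb_strtgy, 2) .ne. 0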