43 use,
intrinsic :: iso_c_binding, only : c_sizeof, c_int32_t, &
44 c_ptr, c_null_ptr, c_size_t, c_associated, c_int
50 integer,
allocatable :: ndofs(:)
51 integer,
allocatable :: offset(:)
53 type(c_ptr) :: buf_d = c_null_ptr
54 type(c_ptr) :: dof_d = c_null_ptr
65 type(c_ptr),
allocatable :: stream(:)
66 type(c_ptr),
allocatable :: event(:)
68 type(c_ptr) :: send_event = c_null_ptr
79 subroutine hip_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
80 bind(c, name=
'hip_gs_pack')
81 use,
intrinsic :: iso_c_binding
83 integer(c_int),
value :: n, offset
84 type(c_ptr),
value :: u_d, buf_d, dof_d, stream
90 bind(c, name=
'hip_gs_unpack')
91 use,
intrinsic :: iso_c_binding
93 integer(c_int),
value :: op, offset, n
94 type(c_ptr),
value :: u_d, buf_d, dof_d, stream
99 subroutine cuda_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
100 bind(c, name=
'cuda_gs_pack')
101 use,
intrinsic :: iso_c_binding
103 integer(c_int),
value :: n, offset
104 type(c_ptr),
value :: u_d, buf_d, dof_d, stream
109 subroutine cuda_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
110 bind(c, name=
'cuda_gs_unpack')
111 use,
intrinsic :: iso_c_binding
113 integer(c_int),
value :: op, offset, n
114 type(c_ptr),
value :: u_d, buf_d, dof_d, stream
121 rbuf_d, roffset, rcount, rrank, nbytes, stream) &
122 bind(c, name=
'device_nccl_sendrecv')
123 use,
intrinsic :: iso_c_binding
125 integer(c_int),
value :: soffset, scount, roffset, rcount
126 integer(c_int),
value :: srank, rrank, nbytes
127 type(c_ptr),
value :: sbuf_d, rbuf_d, stream
135 integer,
allocatable,
intent(inout) :: pe_order(:)
136 type(
stack_i4_t),
allocatable,
intent(inout) :: dof_stack(:)
137 logical,
intent(in) :: mark_dupes
138 integer,
allocatable :: dofs(:)
139 integer :: i, j, total
140 integer(c_size_t) :: sz
142 integer :: dupe, marked, k
143 real(c_rp) :: rp_dummy
144 integer(c_int32_t) :: i4_dummy
147 allocate(this%ndofs(
size(pe_order)))
148 allocate(this%offset(
size(pe_order)))
151 do i = 1,
size(pe_order)
152 this%ndofs(i) = dof_stack(pe_order(i))%size()
153 this%offset(i) = total
154 total = total + this%ndofs(i)
159 sz = c_sizeof(rp_dummy) * total
162 sz = c_sizeof(i4_dummy) * total
165 if (mark_dupes)
call doftable%init(2*total)
166 allocate(dofs(total))
170 do i = 1,
size(pe_order)
172 select type (arr => dof_stack(pe_order(i))%data)
174 do j = 1, this%ndofs(i)
175 k = this%offset(i) + j
177 if (doftable%get(arr(j), dupe) .eq. 0)
then
178 if (dofs(dupe) .gt. 0)
then
179 dofs(dupe) = -dofs(dupe)
185 call doftable%set(arr(j), k)
205 if (
allocated(this%ndofs))
deallocate(this%ndofs)
206 if (
allocated(this%offset))
deallocate(this%offset)
208 if (c_associated(this%buf_d))
call device_free(this%buf_d)
209 if (c_associated(this%dof_d))
call device_free(this%dof_d)
219 call this%init_order(send_pe, recv_pe)
221 call this%send_buf%init(this%send_pe, this%send_dof, .false.)
222 call this%recv_buf%init(this%recv_pe, this%recv_dof, .true.)
224#if defined(HAVE_HIP) || defined(HAVE_CUDA)
226 allocate(this%stream(
size(this%recv_pe)))
227 do i = 1,
size(this%recv_pe)
231 allocate(this%event(
size(this%recv_pe)))
232 do i = 1,
size(this%recv_pe)
244 call this%send_buf%free()
245 call this%recv_buf%free()
247 call this%free_order()
248 call this%free_dofs()
250#if defined(HAVE_HIP) || defined(HAVE_CUDA)
251 if (
allocated(this%stream))
then
252 do i = 1,
size(this%stream)
255 deallocate(this%stream)
264 integer,
intent(in) :: n
265 real(kind=
rp),
dimension(n),
intent(inout) :: u
266 type(c_ptr),
intent(inout) :: deps
267 type(c_ptr),
intent(inout) :: strm
273 do i = 1,
size(this%send_pe)
277 this%send_buf%buf_d, &
278 this%send_buf%dof_d, &
279 this%send_buf%offset(i), &
280 this%send_buf%ndofs(i), &
284 this%send_buf%buf_d, &
285 this%send_buf%dof_d, &
286 this%send_buf%offset(i), &
287 this%send_buf%ndofs(i), &
311 integer,
intent(in) :: n
312 real(kind=
rp),
dimension(n),
intent(inout) :: u
313 type(c_ptr),
intent(inout) :: strm
314 integer :: op, done_req, i
316 real(c_rp) :: rp_dummy
317 integer(c_int) :: nbytes
320 nbytes = c_sizeof(rp_dummy)
322 do i = 1,
size(this%send_pe)
325 nbytes*this%send_buf%offset(i), &
326 this%send_buf%ndofs(i), &
328 this%recv_buf%buf_d, &
329 nbytes*this%recv_buf%offset(i), &
330 this%recv_buf%ndofs(i), &
337 this%recv_buf%buf_d, &
338 this%recv_buf%dof_d, &
339 this%recv_buf%offset(i), &
340 this%recv_buf%ndofs(i), &
344 this%recv_buf%buf_d, &
345 this%recv_buf%dof_d, &
346 this%recv_buf%offset(i), &
347 this%recv_buf%ndofs(i), &
356 do done_req = 1,
size(this%recv_pe)
358 this%event(done_req), 0)
void cuda_gs_unpack(real *u_d, int op, real *buf_d, int *dof_d, int offset, int n, cudaStream_t stream)
void cuda_gs_pack(void *u_d, void *buf_d, void *dof_d, int offset, int n, cudaStream_t stream)
Return the device pointer for an associated Fortran array.
Copy data between host and device (or device and device)
integer, public pe_size
MPI size of communicator.
integer, public pe_rank
MPI rank.
Device abstraction, common interface for various accelerators.
subroutine, public device_event_record(event, stream)
Record a device event.
integer, parameter, public host_to_device
subroutine, public device_free(x_d)
Deallocate memory on the device.
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
subroutine, public device_stream_create_with_priority(stream, flags, prio)
Create a device stream/command queue with priority.
subroutine, public device_stream_wait_event(stream, event, flags)
Synchronize a device stream with an event.
subroutine, public device_event_create(event, flags)
Create a device event queue.
integer, public strm_high_prio
High priority stream setting.
subroutine, public device_stream_destroy(stream)
Destroy a device stream/command queue.
Defines a gather-scatter communication method.
Defines NCCL based gather-scatter communication.
subroutine gs_device_nccl_nbsend(this, u, n, deps, strm)
Post non-blocking send operations.
subroutine gs_device_nccl_init(this, send_pe, recv_pe)
Initialise NCCL based communication method.
subroutine gs_device_nccl_nbwait(this, u, n, op, strm)
Wait for non-blocking operations.
subroutine gs_device_nccl_buf_init(this, pe_order, dof_stack, mark_dupes)
subroutine gs_device_nccl_nbrecv(this)
Post non-blocking receive operations.
subroutine gs_device_nccl_free(this)
Deallocate MPI based communication method.
subroutine gs_device_nccl_buf_free(this)
Defines Gather-scatter operations.
Implements a hash table ADT.
integer, parameter, public c_rp
integer, parameter, public rp
Global precision used in computations.
Implements a dynamic stack ADT.
Gather-scatter communication method.
Buffers for non-blocking communication and packing/unpacking.
Gather-scatter communication using NCCL The arrays are indexed per PE like send_pe and @ recv_pe.
Integer based hash table.