  use mpi_f08, only : mpi_allreduce, mpi_integer, &
       mpi_max, mpi_sendrecv, mpi_status_ignore
  use, intrinsic :: iso_c_binding, only : c_sizeof, c_int32_t, &
       c_ptr, c_null_ptr, c_size_t, c_associated
     integer, allocatable :: ndofs(:)          ! number of dofs exchanged with each PE
     integer, allocatable :: offset(:)         ! offset of each PE's slice in buf_d
     integer, allocatable :: remote_offset(:)  ! offset into the peer's buffer
     type(c_ptr) :: buf_d = c_null_ptr         ! NVSHMEM (symmetric) packing buffer
     type(c_ptr) :: dof_d = c_null_ptr         ! device copy of the dof index list

     type(c_ptr), allocatable :: stream(:)     ! one stream per neighbour PE
     type(c_ptr), allocatable :: event(:)      ! one event per neighbour PE
     integer :: nvshmem_counter = 1            ! running signal value for NVSHMEM puts
     type(c_ptr), allocatable :: notifyDone(:)  ! 8-byte NVSHMEM notification flags
     type(c_ptr), allocatable :: notifyReady(:) ! 8-byte NVSHMEM notification flags
#if defined (HAVE_CUDA) && defined(HAVE_NVSHMEM)
  interface
     subroutine cudamalloc_nvshmem(ptr, size) &
          bind(c, name = 'cudamalloc_nvshmem')
       use, intrinsic :: iso_c_binding
       implicit none
       type(c_ptr) :: ptr
       integer(c_size_t), value :: size
     end subroutine cudamalloc_nvshmem
  end interface

  interface
     subroutine cudafree_nvshmem(ptr) &
          bind(c, name = 'cudafree_nvshmem')
       use, intrinsic :: iso_c_binding
       implicit none
       type(c_ptr) :: ptr
     end subroutine cudafree_nvshmem
  end interface
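  ! C/CUDA entry points for the device side of the exchange (their
  ! implementations are not part of this listing).  Judging from the argument
  ! names and from how they are called in gs_device_shmem_nbwait below:
  !  - cuda_gs_pack_and_push packs the dofs destined for one neighbour PE
  !    into buf_d on the given stream and pushes them with a one-sided
  !    NVSHMEM put into that PE's symmetric buffer at remote_offset,
  !    signalling the peer through notifyDone.
  !  - cuda_gs_pack_and_push_wait blocks the stream until the local
  !    notifyDone flag reaches the expected nvshmem_counter value.
  !  - cuda_gs_unpack scatters a received slice of buf_d back into u_d,
  !    combining entries according to the reduction operation op.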
  interface
     subroutine cuda_gs_pack_and_push(u_d, buf_d, dof_d, offset, n, stream, &
          srank, rbuf_d, roffset, remote_offset, &
          rrank, nvshmem_counter, notifyDone, &
          notifyReady, iter) &
          bind(c, name = 'cuda_gs_pack_and_push')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: n, offset, srank, roffset, rrank, iter
       integer(c_int), value :: nvshmem_counter
       type(c_ptr), value :: u_d, buf_d, dof_d, stream, rbuf_d, notifyDone, &
            notifyReady
       integer(c_int), dimension(*) :: remote_offset
     end subroutine cuda_gs_pack_and_push
  end interface
  interface
     subroutine cuda_gs_pack_and_push_wait(stream, nvshmem_counter, &
          notifyDone) bind(c, name = 'cuda_gs_pack_and_push_wait')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: nvshmem_counter
       type(c_ptr), value :: stream, notifyDone
     end subroutine cuda_gs_pack_and_push_wait
  end interface
  interface
     subroutine cuda_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
          bind(c, name = 'cuda_gs_unpack')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: op, offset, n
       type(c_ptr), value :: u_d, buf_d, dof_d, stream
     end subroutine cuda_gs_unpack
  end interface
#endif

    integer, allocatable, intent(inout) :: pe_order(:)
    type(stack_i4_t), allocatable, intent(inout) :: dof_stack(:)
    logical, intent(in) :: mark_dupes
    integer, allocatable :: dofs(:)
    integer :: i, j, total, max_total
    integer(c_size_t) :: sz
    type(htable_i4_t) :: doftable
    integer :: dupe, marked, k
    real(c_rp) :: rp_dummy
    integer(c_int32_t) :: i4_dummy

    allocate(this%ndofs(size(pe_order)))
    allocate(this%offset(size(pe_order)))
    allocate(this%remote_offset(size(pe_order)))

    do i = 1, size(pe_order)
       this%remote_offset(i) = -1
    end do

    total = 0
    do i = 1, size(pe_order)
       this%ndofs(i) = dof_stack(pe_order(i))%size()
       this%offset(i) = total
       total = total + this%ndofs(i)
    end do

    ! Size the buffer by the largest exchange over all ranks, so that every
    ! PE requests the same amount from the NVSHMEM symmetric heap
    call mpi_allreduce(total, max_total, 1, mpi_integer, mpi_max, neko_comm)

    sz = c_sizeof(rp_dummy) * max_total
    call cudamalloc_nvshmem(this%buf_d, sz)

    sz = c_sizeof(i4_dummy) * total
    call device_alloc(this%dof_d, sz)

    if (mark_dupes) call doftable%init(2*total)
    allocate(dofs(total))
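    ! Flatten the per-PE dof stacks into a single index array.  When
    ! mark_dupes is set, a dof that appears for more than one PE is looked up
    ! in doftable and flagged by negating its entry, presumably so the
    ! unpack kernel can treat shared entries specially.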
    do i = 1, size(pe_order)
       select type (arr => dof_stack(pe_order(i))%data)
       type is (integer)
          do j = 1, this%ndofs(i)
             k = this%offset(i) + j
             if (mark_dupes) then
                if (doftable%get(arr(j), dupe) .eq. 0) then
                   if (dofs(dupe) .gt. 0) then
                      dofs(dupe) = -dofs(dupe)
                   end if
                   dofs(k) = -arr(j)
                else
                   call doftable%set(arr(j), k)
                   dofs(k) = arr(j)
                end if
             else
                dofs(k) = arr(j)
             end if
          end do
       end select
    end do

    if (allocated(this%ndofs)) deallocate(this%ndofs)
    if (allocated(this%offset)) deallocate(this%offset)

    if (c_associated(this%buf_d)) call cudafree_nvshmem(this%buf_d)
    if (c_associated(this%dof_d)) call device_free(this%dof_d)

    call this%init_order(send_pe, recv_pe)

    call this%send_buf%init(this%send_pe, this%send_dof, .false.)
    call this%recv_buf%init(this%recv_pe, this%recv_dof, .true.)

#if defined(HAVE_HIP) || defined(HAVE_CUDA)

    allocate(this%stream(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       ! create a high-priority stream for neighbour i
       ! (device_stream_create_with_priority with strm_high_prio)
    end do

    allocate(this%event(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       ! create a device event for neighbour i (device_event_create)
    end do

    ! 8-byte NVSHMEM flag words used to signal per-neighbour transfer state
    allocate(this%notifyDone(size(this%recv_pe)))
    allocate(this%notifyReady(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       call cudamalloc_nvshmem(this%notifyDone(i), 8_8)
       call cudamalloc_nvshmem(this%notifyReady(i), 8_8)
    end do
#endif

    call this%send_buf%free()
    call this%recv_buf%free()

    call this%free_order()
    call this%free_dofs()

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
    if (allocated(this%stream)) then
       do i = 1, size(this%stream)
          call device_stream_destroy(this%stream(i))
       end do
       deallocate(this%stream)
    end if

    integer, intent(in) :: n
    real(kind=rp), dimension(n), intent(inout) :: u
    type(c_ptr), intent(inout) :: deps
    type(c_ptr), intent(inout) :: strm

    do i = 1, size(this%send_pe)

    integer, intent(in) :: n
    real(kind=rp), dimension(n), intent(inout) :: u
    type(c_ptr), intent(inout) :: strm
    integer :: op, done_req, i
    type(c_ptr) :: u_d

    u_d = device_get_ptr(u)

    do i = 1, size(this%send_pe)

       ! Exchange buffer offsets once, so this PE knows where in the peer's
       ! symmetric receive buffer its contribution has to be written
       if (this%recv_buf%remote_offset(i) .eq. -1) then
          call mpi_sendrecv(this%recv_buf%offset(i), 1, mpi_integer, &
               this%recv_pe(i), 0, &
               this%recv_buf%remote_offset(i), 1, mpi_integer, &
               this%send_pe(i), 0, neko_comm, mpi_status_ignore)
       end if

       call cuda_gs_pack_and_push(u_d, &
            this%send_buf%buf_d, &
            this%send_buf%dof_d, &
            this%send_buf%offset(i), &
            this%send_buf%ndofs(i), &
            this%stream(i), &
            this%send_pe(i), &
            this%recv_buf%buf_d, &
            this%recv_buf%offset(i), &
            this%recv_buf%remote_offset, &
            this%recv_pe(i), &
            this%nvshmem_counter, &
            this%notifyDone(i), &
            this%notifyReady(i), &
            i)
       this%nvshmem_counter = this%nvshmem_counter + 1
    end do

    do i = 1, size(this%send_pe)
       call cuda_gs_pack_and_push_wait(this%stream(i), &
            this%nvshmem_counter - size(this%send_pe) + i - 1, &
            this%notifyDone(i))
    end do

    do done_req = 1, size(this%recv_pe)
       call cuda_gs_unpack(u_d, op, &
            this%recv_buf%buf_d, &
            this%recv_buf%dof_d, &
            this%recv_buf%offset(done_req), &
            this%recv_buf%ndofs(done_req), &
            this%stream(done_req))
       call device_event_record(this%event(done_req), this%stream(done_req))
    end do

    ! Make the caller's stream wait until every per-PE unpack has finished
    do done_req = 1, size(this%recv_pe)
       call device_stream_wait_event(strm, &
            this%event(done_req), 0)
    end do

Referenced interfaces and entities

C kernel binding:
  void cuda_gs_unpack(real *u_d, int op, real *buf_d, int *dof_d,
                      int offset, int n, cudaStream_t stream)

Communication layer:
  neko_comm (type(mpi_comm), public) -- MPI communicator.
  pe_rank (integer, public)          -- MPI rank.
  pe_size (integer, public)          -- MPI size of communicator.

Device abstraction (common interface for various accelerators):
  device_alloc(x_d, s)                           -- Allocate memory on the device.
  device_free(x_d)                               -- Deallocate memory on the device.
  device_event_create(event, flags)              -- Create a device event queue.
  device_event_record(event, stream)             -- Record a device event.
  device_stream_create_with_priority(stream, flags, prio)
                                                 -- Create a device stream/command queue with priority.
  device_stream_destroy(stream)                  -- Destroy a device stream/command queue.
  device_stream_wait_event(stream, event, flags) -- Synchronize a device stream with an event.
  strm_high_prio (integer, public)               -- High priority stream setting.
  host_to_device (integer, parameter, public)
  Also used: routines that return the device pointer for an associated
  Fortran array, copy data between host and device (or device and device),
  and synchronize a device or stream.

Precision kinds:
  rp (integer, parameter, public)   -- Global precision used in computations.
  c_rp (integer, parameter, public)

Related modules:
  A module defining the gather-scatter communication method.
  A module defining GPU aware MPI gather-scatter communication.
  A hash table ADT (including an integer based hash table).
  A dynamic stack ADT.

Derived types:
  Gather-scatter communication method.
  Buffers for non-blocking communication and packing/unpacking.
  Gather-scatter communication using device SHMEM; the arrays are indexed
  per PE, like send_pe and recv_pe.

Procedures in this module:
  gs_device_shmem_init(this, send_pe, recv_pe)   -- Initialise the device SHMEM communication method.
  gs_device_shmem_free(this)                     -- Deallocate the device SHMEM communication method.
  gs_device_shmem_buf_init(this, pe_order, dof_stack, mark_dupes) -- Initialise the per-PE buffers.
  gs_device_shmem_buf_free(this)                 -- Free the per-PE buffers.
  gs_device_shmem_nbsend(this, u, n, deps, strm) -- Post non-blocking send operations.
  gs_device_shmem_nbrecv(this)                   -- Post non-blocking receive operations.
  gs_device_shmem_nbwait(this, u, n, op, strm)   -- Wait for non-blocking operations.
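
The C/CUDA implementations behind cudamalloc_nvshmem, cuda_gs_pack_and_push
and cuda_gs_pack_and_push_wait are not part of this listing. As a rough
illustration of the put-with-signal pattern that the Fortran interfaces above
suggest, a minimal sketch might look as follows; the kernel pack_kernel, the
*_sketch function names, the use of double precision and every parameter
choice are assumptions made for this example, not Neko's actual
implementation.

#include <stdint.h>
#include <cuda_runtime.h>
#include <nvshmem.h>
#include <nvshmemx.h>

/* Hypothetical packing kernel: gather u(dof(k)) into the staging buffer
 * (dof holds zero-based indices in this sketch). */
__global__ void pack_kernel(const double *u, double *buf, const int *dof,
                            int offset, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    buf[offset + i] = u[dof[offset + i]];
}

/* Sketch: pack one neighbour's dofs on `stream`, then push them into the
 * peer PE's symmetric buffer at remote_offset and set its notify_done flag
 * to `counter` with a single put-with-signal operation. */
extern "C" void gs_pack_and_push_sketch(const double *u_d, double *buf_d,
                                        const int *dof_d, int offset, int n,
                                        cudaStream_t stream, int peer,
                                        double *rbuf_d, int remote_offset,
                                        uint64_t *notify_done, uint64_t counter) {
  const int block = 256;
  pack_kernel<<<(n + block - 1) / block, block, 0, stream>>>(u_d, buf_d, dof_d,
                                                             offset, n);

  /* buf_d, rbuf_d and notify_done must all come from the NVSHMEM symmetric
   * heap (cf. cudamalloc_nvshmem above) for the one-sided put to be legal. */
  nvshmemx_putmem_signal_on_stream(rbuf_d + remote_offset, buf_d + offset,
                                   (size_t) n * sizeof(double),
                                   notify_done, counter, NVSHMEM_SIGNAL_SET,
                                   peer, stream);
}

/* Sketch: block `stream` until the local notify_done flag, written by the
 * peer's put-with-signal, reaches the expected counter value. */
extern "C" void gs_pack_and_push_wait_sketch(cudaStream_t stream,
                                             uint64_t *notify_done,
                                             uint64_t counter) {
  nvshmemx_signal_wait_until_on_stream(notify_done, NVSHMEM_CMP_GE, counter,
                                       stream);
}

The point of the pattern is that the staging buffers and the notify flags
live on the NVSHMEM symmetric heap, so a single put-with-signal call can
deliver both the payload and the completion flag to the peer GPU without
involving the host in the critical path.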