  use mpi_f08, only : mpi_allreduce, mpi_integer, &
       mpi_max, mpi_sendrecv, mpi_status_ignore
  use, intrinsic :: iso_c_binding, only : c_sizeof, c_int32_t, &
       c_ptr, c_null_ptr, c_size_t, c_associated
     ! Per-peer buffer bookkeeping
     integer, allocatable :: ndofs(:)          ! number of dofs per peer
     integer, allocatable :: offset(:)         ! offset of each peer's block in the packed buffer
     integer, allocatable :: remote_offset(:)  ! offset into the peer's buffer (exchanged at first use)
     type(c_ptr) :: buf_d = c_null_ptr         ! packed device buffer (NVSHMEM symmetric heap)
     type(c_ptr) :: dof_d = c_null_ptr         ! device copy of the dof index list

     ! Per-peer streams, events and NVSHMEM notification flags
     type(c_ptr), allocatable :: stream(:)
     type(c_ptr), allocatable :: event(:)
     integer :: nvshmem_counter = 1
     type(c_ptr), allocatable :: notifyDone(:)
     type(c_ptr), allocatable :: notifyReady(:)
#if defined (HAVE_CUDA) && defined(HAVE_NVSHMEM)
     subroutine cudamalloc_nvshmem(ptr, size) &
          bind(c, name = 'cudamalloc_nvshmem')
       use, intrinsic :: iso_c_binding
       type(c_ptr) :: ptr
       integer(c_size_t), value :: size
     end subroutine cudamalloc_nvshmem
     subroutine cudafree_nvshmem(ptr) &
          bind(c, name = 'cudafree_nvshmem')
       use, intrinsic :: iso_c_binding
       type(c_ptr), value :: ptr
     end subroutine cudafree_nvshmem
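The C side of cudamalloc_nvshmem and cudafree_nvshmem is not part of this listing. A minimal sketch, assuming the wrappers do nothing more than forward to NVSHMEM's symmetric-heap allocator (the bodies below are illustrative, not the project's actual code):

/* Illustrative sketch of the C wrappers behind the bind(c) interfaces above;
 * assumes they simply forward to NVSHMEM's collective symmetric-heap allocator. */
#include <nvshmem.h>

extern "C" void cudamalloc_nvshmem(void **ptr, size_t size) {
  /* Collective: every PE must request the same size (hence the mpi_max below). */
  *ptr = nvshmem_malloc(size);
}

extern "C" void cudafree_nvshmem(void *ptr) {
  /* Collective: releases the symmetric allocation on all PEs. */
  nvshmem_free(ptr);
}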
     subroutine cuda_gs_pack_and_push(u_d, buf_d, dof_d, offset, n, stream, &
          srank, rbuf_d, roffset, remote_offset, &
          rrank, nvshmem_counter, notifyDone, &
          notifyReady, iter) &
          bind(c, name = 'cuda_gs_pack_and_push')
       use, intrinsic :: iso_c_binding
       integer(c_int), value :: n, offset, srank, roffset, rrank, iter
       integer(c_int), value :: nvshmem_counter
       type(c_ptr), value :: u_d, buf_d, dof_d, stream, rbuf_d, notifyDone, notifyReady
       integer(c_int), dimension(*) :: remote_offset
     end subroutine cuda_gs_pack_and_push
     subroutine cuda_gs_pack_and_push_wait(stream, nvshmem_counter, notifyDone) &
          bind(c, name = 'cuda_gs_pack_and_push_wait')
       use, intrinsic :: iso_c_binding
       integer(c_int), value :: nvshmem_counter
       type(c_ptr), value :: stream, notifyDone
     end subroutine cuda_gs_pack_and_push_wait
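Neither cuda_gs_pack_and_push nor cuda_gs_pack_and_push_wait is implemented in this listing. The sketch below is a hypothetical rendering of the pattern the interfaces suggest: gather the dofs into a contiguous block on a stream, put that block into the peer's symmetric buffer together with a signal, and later make a stream wait on the local notification word. The function names, the pack kernel and the exact signal protocol are assumptions; only the NVSHMEM host calls (nvshmemx_putmem_signal_on_stream, nvshmemx_signal_wait_until_on_stream) are standard API.

/* Hypothetical sketch; not the project's kernels. double stands in for the
 * configured working precision. */
#include <stdint.h>
#include <cuda_runtime.h>
#include <nvshmem.h>
#include <nvshmemx.h>

/* Gather u(dof(i)) into a contiguous block of the packed buffer
 * (1-based Fortran dof indices assumed; duplicate handling omitted). */
__global__ void pack_kernel(const double *u, double *buf, const int *dof,
                            int offset, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) buf[offset + i] = u[dof[offset + i] - 1];
}

void pack_and_push_sketch(const double *u_d, double *buf_d, const int *dof_d,
                          int offset, int n, double *rbuf_d, int roffset,
                          int peer, uint64_t *notify_done, uint64_t counter,
                          cudaStream_t stream) {
  pack_kernel<<<(n + 255) / 256, 256, 0, stream>>>(u_d, buf_d, dof_d, offset, n);

  /* Stream-ordered put into the peer's symmetric receive buffer; once the data
   * has arrived, the peer's notification word is set to the current counter. */
  nvshmemx_putmem_signal_on_stream(rbuf_d + roffset, buf_d + offset,
                                   n * sizeof(double), notify_done, counter,
                                   NVSHMEM_SIGNAL_SET, peer, stream);
}

void pack_and_push_wait_sketch(uint64_t *notify_done, uint64_t counter,
                               cudaStream_t stream) {
  /* Hold the stream until the local notification word reaches the counter. */
  nvshmemx_signal_wait_until_on_stream(notify_done, NVSHMEM_CMP_GE, counter, stream);
}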
     subroutine cuda_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
          bind(c, name = 'cuda_gs_unpack')
       use, intrinsic :: iso_c_binding
       integer(c_int), value :: op, offset, n
       type(c_ptr), value :: u_d, buf_d, dof_d, stream
     end subroutine cuda_gs_unpack
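The kernel behind cuda_gs_unpack is likewise not shown here. A hypothetical sketch for the additive reduction (the op argument presumably selects among the gather-scatter operations) could look like the following; the real typedef and the op dispatch are assumptions, while the C prototype matches the one given in the cross-references at the end.

/* Illustrative only: scatter the packed receive buffer back into u, combining
 * with existing values. Only addition is shown; the real kernel dispatches on op. */
#include <cuda_runtime.h>

typedef double real;  /* stand-in for the configured working precision */

__global__ void unpack_add_kernel(real *u, const real *buf, const int *dof,
                                  int offset, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    /* 1-based dof indices assumed; atomicAdd (sm_60+ for double) guards
     * against repeated target indices. */
    atomicAdd(&u[dof[offset + i] - 1], buf[offset + i]);
  }
}

extern "C" void cuda_gs_unpack(real *u_d, int op, real *buf_d, int *dof_d,
                               int offset, int n, cudaStream_t stream) {
  /* op handling beyond addition omitted in this sketch. */
  unpack_add_kernel<<<(n + 255) / 256, 256, 0, stream>>>(u_d, buf_d, dof_d, offset, n);
}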
  ! From gs_device_shmem_buf_init(this, pe_order, dof_stack, mark_dupes):
    integer, allocatable, intent(inout) :: pe_order(:)
    type(stack_i4_t), allocatable, intent(inout) :: dof_stack(:)
    logical, intent(in) :: mark_dupes
    integer, allocatable :: dofs(:)
    integer :: i, j, total, max_total
    integer(c_size_t) :: sz
    integer :: dupe, marked, k
    real(c_rp) :: rp_dummy
    integer(c_int32_t) :: i4_dummy

    allocate(this%ndofs(size(pe_order)))
    allocate(this%offset(size(pe_order)))
    allocate(this%remote_offset(size(pe_order)))

    ! Remote offsets are unknown until exchanged with each peer (see nbwait)
    do i = 1, size(pe_order)
       this%remote_offset(i) = -1
    end do

    ! Count dofs per peer and record each peer's offset into the packed buffer
    total = 0
    do i = 1, size(pe_order)
       this%ndofs(i) = dof_stack(pe_order(i))%size()
       this%offset(i) = total
       total = total + this%ndofs(i)
    end do

    ! NVSHMEM allocations are collective and symmetric (same size on every PE),
    ! hence the buffer is sized by the global maximum
    call mpi_allreduce(total, max_total, 1, mpi_integer, mpi_max, neko_comm)

    sz = c_sizeof(rp_dummy) * max_total
    call cudamalloc_nvshmem(this%buf_d, sz)

    sz = c_sizeof(i4_dummy) * total
    ! ...

    if (mark_dupes) call doftable%init(2 * total)
    allocate(dofs(total))

    ! Build the packed dof list; a dof seen before is flagged by negating it
    do i = 1, size(pe_order)
       select type (arr => dof_stack(pe_order(i))%data)
       ! ...
          do j = 1, this%ndofs(i)
             k = this%offset(i) + j
             ! ...
             if (doftable%get(arr(j), dupe) .eq. 0) then
                ! dof already in the table: flag its first occurrence
                if (dofs(dupe) .gt. 0) then
                   dofs(dupe) = -dofs(dupe)
                end if
                ! ...
             else
                call doftable%set(arr(j), k)
                ! ...
             end if
    ! ... (remainder of the routine elided in this extract)
  ! From gs_device_shmem_buf_free(this):
    if (allocated(this%ndofs)) deallocate(this%ndofs)
    if (allocated(this%offset)) deallocate(this%offset)
    ! ...
    if (c_associated(this%buf_d)) call cudafree_nvshmem(this%buf_d)
    ! ...
    if (c_associated(this%dof_d)) call device_free(this%dof_d)
  ! From gs_device_shmem_init(this, send_pe, recv_pe):
    call this%init_order(send_pe, recv_pe)

    call this%send_buf%init(this%send_pe, this%send_dof, .false.)
    call this%recv_buf%init(this%recv_pe, this%recv_dof, .true.)

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
    ! One stream and one event per peer
    allocate(this%stream(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       ! ... (stream creation elided in this extract)
    end do

    allocate(this%event(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       ! ... (event creation elided in this extract)
    end do

    ! 8-byte notification flags on the NVSHMEM symmetric heap, one pair per peer
    allocate(this%notifyDone(size(this%recv_pe)))
    allocate(this%notifyReady(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       call cudamalloc_nvshmem(this%notifyDone(i), 8_8)
       call cudamalloc_nvshmem(this%notifyReady(i), 8_8)
    end do
  ! From gs_device_shmem_free(this):
    call this%send_buf%free()
    call this%recv_buf%free()

    call this%free_order()
    call this%free_dofs()

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
    if (allocated(this%stream)) then
       do i = 1, size(this%stream)
          ! ... (stream destruction elided in this extract)
       end do
       deallocate(this%stream)
    end if
  ! From gs_device_shmem_nbsend(this, u, n, deps, strm):
    integer, intent(in) :: n
    real(kind=rp), dimension(n), intent(inout) :: u
    type(c_ptr), intent(inout) :: deps
    type(c_ptr), intent(inout) :: strm

    ! ...
    do i = 1, size(this%send_pe)
       ! ... (per-peer send setup elided in this extract)
    end do
  ! From gs_device_shmem_nbwait(this, u, n, op, strm):
    integer, intent(in) :: n
    real(kind=rp), dimension(n), intent(inout) :: u
    type(c_ptr), intent(inout) :: strm
    integer :: op, done_req, i

    ! ...
    do i = 1, size(this%send_pe)
       ! Exchange packed-buffer offsets with each peer once, on first use
       if (this%recv_buf%remote_offset(i) .eq. -1) then
          call mpi_sendrecv(this%recv_buf%offset(i), 1, mpi_integer, &
               this%recv_pe(i), 0, &
               this%recv_buf%remote_offset(i), 1, mpi_integer, &
               this%send_pe(i), 0, neko_comm, mpi_status_ignore)
       end if

       ! Pack this peer's dofs and push them straight into the peer's
       ! symmetric receive buffer, tagging the transfer with the counter
       call cuda_gs_pack_and_push(u_d, &
            this%send_buf%buf_d, &
            this%send_buf%dof_d, &
            this%send_buf%offset(i), &
            this%send_buf%ndofs(i), &
            ! ... (stream and peer-rank arguments elided in this extract)
            this%recv_buf%buf_d, &
            this%recv_buf%offset(i), &
            this%recv_buf%remote_offset, &
            ! ... (peer-rank argument elided in this extract)
            this%nvshmem_counter, &
            this%notifyDone(i), &
            this%notifyReady(i))   ! (remaining arguments elided in this extract)
       this%nvshmem_counter = this%nvshmem_counter + 1
    end do

    ! Wait on each peer's notification flag before unpacking
    do i = 1, size(this%send_pe)
       call cuda_gs_pack_and_push_wait(this%stream(i), &
            this%nvshmem_counter - size(this%send_pe) + i - 1, &
            this%notifyDone(i))
    end do

    ! Unpack each peer's contribution on that peer's stream
    do done_req = 1, size(this%recv_pe)
       call cuda_gs_unpack(u_d, op, &
            this%recv_buf%buf_d, &
            this%recv_buf%dof_d, &
            this%recv_buf%offset(done_req), &
            this%recv_buf%ndofs(done_req), &
            this%stream(done_req))
       ! ...
    end do

    ! Make the caller's stream wait for the per-peer unpack streams
    do done_req = 1, size(this%recv_pe)
       call device_stream_wait_event(strm, &
            this%event(done_req), 0)
    end do
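The device_stream_wait_event loop above, presumably paired with device_event_record calls on the per-peer streams (elided in this extract), is a standard CUDA stream-ordering idiom: record an event on each peer stream after its unpack, then make the caller's stream wait on all of those events. A minimal sketch with the raw CUDA runtime API (names here are illustrative; the actual code goes through Neko's device_* wrappers):

#include <cuda_runtime.h>

/* Make main_stream wait for the work queued on every per-peer stream
 * (pack, push and unpack above) without blocking the host. */
void join_peer_streams(cudaStream_t main_stream, cudaStream_t *peer_stream,
                       cudaEvent_t *peer_event, int npeers) {
  for (int i = 0; i < npeers; i++) {
    /* Mark the point on the peer stream after its last queued kernel ... */
    cudaEventRecord(peer_event[i], peer_stream[i]);
    /* ... and defer any later work on main_stream until that point is reached. */
    cudaStreamWaitEvent(main_stream, peer_event[i], 0);
  }
}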
Cross-references from the documentation page:

  C binding:
    void cuda_gs_unpack(real *u_d, int op, real *buf_d, int *dof_d, int offset, int n, cudaStream_t stream)

  Communication:
    neko_comm -- MPI communicator
    pe_rank   -- MPI rank
    pe_size   -- MPI size of communicator

  Device abstraction, common interface for various accelerators: routines to return the
  device pointer for an associated Fortran array, copy data between host and device (or
  device and device), and synchronize a device or stream, plus
    device_alloc(x_d, s)               -- Allocate memory on the device
    device_free(x_d)                   -- Deallocate memory on the device
    device_event_create(event, flags)  -- Create a device event queue
    device_event_record(event, stream) -- Record a device event
    device_stream_create_with_priority(stream, flags, prio) -- Create a device stream/command queue with priority
    device_stream_wait_event(stream, event, flags)          -- Synchronize a device stream with an event
    device_stream_destroy(stream)      -- Destroy a device stream/command queue
    strm_high_prio                     -- High priority stream setting
    host_to_device                     -- integer parameter

  Other modules: the gather-scatter communication method base class, gather-scatter
  operations, a hash table ADT (integer based hash table), a dynamic stack ADT, and the
  global precision parameters rp (global precision used in computations) and c_rp.

  This module (gather-scatter communication using device SHMEM, with buffers for
  non-blocking communication and packing/unpacking; the arrays are indexed per PE like
  send_pe and rec...):
    gs_device_shmem_init(this, send_pe, recv_pe)   -- Initialise the communication method
    gs_device_shmem_free(this)                     -- Deallocate the communication method
    gs_device_shmem_buf_init(this, pe_order, dof_stack, mark_dupes)
    gs_device_shmem_buf_free(this)
    gs_device_shmem_nbsend(this, u, n, deps, strm) -- Post non-blocking send operations
    gs_device_shmem_nbrecv(this)                   -- Post non-blocking receive operations
    gs_device_shmem_nbwait(this, u, n, op, strm)   -- Wait for non-blocking operations