  use, intrinsic :: iso_c_binding, only : c_sizeof, c_int32_t, &
       c_ptr, c_null_ptr, c_size_t, c_associated
     integer, allocatable :: ndofs(:)
     integer, allocatable :: offset(:)
     integer, allocatable :: remote_offset(:)

     type(c_ptr) :: buf_d = c_null_ptr
     type(c_ptr) :: dof_d = c_null_ptr
     type(c_ptr), allocatable :: stream(:)
     type(c_ptr), allocatable :: event(:)
     integer :: nvshmem_counter = 1
     type(c_ptr), allocatable :: notifydone(:)
     type(c_ptr), allocatable :: notifyready(:)
#if defined (HAVE_CUDA) && defined(HAVE_NVSHMEM)
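  ! C wrappers around NVSHMEM allocation. Presumably these call
  ! nvshmem_malloc/nvshmem_free on the symmetric heap; it is the
  ! symmetric heap that lets one PE write directly into a peer's
  ! receive buffer further down.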
     subroutine cudamalloc_nvshmem(ptr, size) &
          bind(c, name = 'cudamalloc_nvshmem')
       use, intrinsic :: iso_c_binding
       implicit none
       type(c_ptr) :: ptr
       integer(c_size_t), value :: size
     end subroutine cudamalloc_nvshmem
     subroutine cudafree_nvshmem(ptr) &
          bind(c, name = 'cudafree_nvshmem')
       use, intrinsic :: iso_c_binding
       implicit none
       type(c_ptr), value :: ptr
     end subroutine cudafree_nvshmem
     subroutine cuda_gs_pack_and_push(u_d, buf_d, dof_d, offset, n, stream, &
          srank, rbuf_d, roffset, remote_offset, &
          rrank, nvshmem_counter, notifyDone, &
          notifyReady, iter) &
          bind(c, name = 'cuda_gs_pack_and_push')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: n, offset, srank, roffset, rrank, iter
       integer(c_int), value :: nvshmem_counter
       type(c_ptr), value :: u_d, buf_d, dof_d, stream, rbuf_d, &
            notifydone, notifyready
       integer(c_int), dimension(*) :: remote_offset
     end subroutine cuda_gs_pack_and_push
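     ! Fused pack-and-push: gathers the send dofs of u_d into buf_d and
     ! writes them straight into the peer's receive buffer (rbuf_d at the
     ! exchanged remote offset), then raises the notifyDone flag;
     ! presumably a pack kernel combined with an NVSHMEM put on stream.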
     subroutine cuda_gs_pack_and_push_wait(stream, nvshmem_counter, notifyDone) &
          bind(c, name = 'cuda_gs_pack_and_push_wait')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: nvshmem_counter
       type(c_ptr), value :: stream, notifydone
     end subroutine cuda_gs_pack_and_push_wait
     subroutine cuda_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
          bind(c, name = 'cuda_gs_unpack')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: op, offset, n
       type(c_ptr), value :: u_d, buf_d, dof_d, stream
     end subroutine cuda_gs_unpack
    integer, allocatable, intent(inout) :: pe_order(:)
    type(stack_i4_t), allocatable, intent(inout) :: dof_stack(:)
    logical, intent(in) :: mark_dupes
    integer, allocatable :: dofs(:)
    integer :: i, j, total, max_total
    integer(c_size_t) :: sz
    type(htable_i4_t) :: doftable ! assumed: integer hash table used below
    integer :: dupe, marked, k
    real(c_rp) :: rp_dummy
    integer(c_int32_t) :: i4_dummy
    allocate(this%ndofs(size(pe_order)))
    allocate(this%offset(size(pe_order)))
    allocate(this%remote_offset(size(pe_order)))

    do i = 1, size(pe_order)
       this%remote_offset(i) = -1
    end do

    total = 0
    do i = 1, size(pe_order)
       this%ndofs(i) = dof_stack(pe_order(i))%size()
       this%offset(i) = total
       total = total + this%ndofs(i)
    end do
    call mpi_allreduce(total, max_total, 1, mpi_integer, mpi_max, neko_comm)

    sz = c_sizeof(rp_dummy) * max_total
    call cudamalloc_nvshmem(this%buf_d, sz)

    sz = c_sizeof(i4_dummy) * total
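    ! The buffer is sized by the global maximum (max_total) rather than
    ! the local total: NVSHMEM symmetric-heap allocations are collective
    ! and must request the same size on every PE (assuming
    ! cudamalloc_nvshmem allocates from the symmetric heap).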
    if (mark_dupes) call doftable%init(2 * total)

    allocate(dofs(total))
    do i = 1, size(pe_order)
       select type (arr => dof_stack(pe_order(i))%data)
       type is (integer)
          do j = 1, this%ndofs(i)
             k = this%offset(i) + j
             if (mark_dupes) then
                if (doftable%get(arr(j), dupe) .eq. 0) then
                   if (dofs(dupe) .gt. 0) then
                      dofs(dupe) = -dofs(dupe)
                   end if
                else
                   call doftable%set(arr(j), k)
                end if
             end if
          end do
       end select
    end do
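    ! A dof id that is already in the table is a duplicate; flipping the
    ! first occurrence negative marks it, presumably so the device
    ! kernels can treat repeated entries specially when packing.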
    if (allocated(this%ndofs)) deallocate(this%ndofs)
    if (allocated(this%offset)) deallocate(this%offset)
    if (c_associated(this%buf_d)) call cudafree_nvshmem(this%buf_d)

    if (c_associated(this%dof_d)) call device_free(this%dof_d)
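    ! Each pointer is released by the allocator that created it: buf_d
    ! lives on the NVSHMEM symmetric heap, while dof_d appears to be an
    ! ordinary device allocation.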
    call this%init_order(send_pe, recv_pe)

    call this%send_buf%init(this%send_pe, this%send_dof, .false.)
    call this%recv_buf%init(this%recv_pe, this%recv_dof, .true.)
#if defined(HAVE_HIP) || defined(HAVE_CUDA)
    allocate(this%stream(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       ! one high-priority stream per neighbour PE
       ! (flag value 1 assumed to mean non-blocking)
       call device_stream_create_with_priority(this%stream(i), 1, &
            strm_high_prio)
    end do

    allocate(this%event(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       ! flag value 2 assumed to disable event timing
       call device_event_create(this%event(i), 2)
    end do
    allocate(this%notifyDone(size(this%recv_pe)))
    allocate(this%notifyReady(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       call cudamalloc_nvshmem(this%notifyDone(i), 8_8)
       call cudamalloc_nvshmem(this%notifyReady(i), 8_8)
    end do
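    ! One 8-byte flag per neighbour PE on the symmetric heap; these act
    ! as done/ready signals between the local packing kernel and the
    ! remote PE.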
    call this%send_buf%free()
    call this%recv_buf%free()

    call this%free_order()
    call this%free_dofs()
#if defined(HAVE_HIP) || defined(HAVE_CUDA)
    if (allocated(this%stream)) then
       do i = 1, size(this%stream)
          call device_stream_destroy(this%stream(i))
       end do
       deallocate(this%stream)
    end if
    integer, intent(in) :: n
    real(kind=rp), dimension(n), intent(inout) :: u
    type(c_ptr), intent(inout) :: deps
    type(c_ptr), intent(inout) :: strm
    do i = 1, size(this%send_pe)
    integer, intent(in) :: n
    real(kind=rp), dimension(n), intent(inout) :: u
    type(c_ptr), intent(inout) :: strm
    integer :: op, done_req, i
    do i = 1, size(this%send_pe)
       if (this%recv_buf%remote_offset(i) .eq. -1) then
          call mpi_sendrecv(this%recv_buf%offset(i), 1, mpi_integer, &
               this%recv_pe(i), 0, &
               this%recv_buf%remote_offset(i), 1, mpi_integer, &
               this%send_pe(i), 0, neko_comm, mpi_status_ignore)
       end if
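
       ! Lazy handshake: on the first call each pair of PEs exchanges the
       ! offset at which its data lands in the peer's receive buffer, so
       ! later calls can push without any MPI traffic.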
       ! the stream, srank, rrank and iter arguments below are assumed to
       ! be the per-PE stream, the send/recv peer ranks and the loop index
       call cuda_gs_pack_and_push(u_d, &
            this%send_buf%buf_d, &
            this%send_buf%dof_d, &
            this%send_buf%offset(i), &
            this%send_buf%ndofs(i), &
            this%stream(i), &
            this%send_pe(i), &
            this%recv_buf%buf_d, &
            this%recv_buf%offset(i), &
            this%recv_buf%remote_offset, &
            this%recv_pe(i), &
            this%nvshmem_counter, &
            this%notifyDone(i), &
            this%notifyReady(i), &
            i)
       this%nvshmem_counter = this%nvshmem_counter + 1
    end do
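    ! nvshmem_counter is bumped once per destination PE per call, so each
    ! push carries a fresh signal value; presumably the peer waits for
    ! exactly this value, which keeps stale signals from earlier rounds
    ! from matching.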
    do i = 1, size(this%send_pe)
       call cuda_gs_pack_and_push_wait(this%stream(i), &
            this%nvshmem_counter - size(this%send_pe) + i - 1, &
            this%notifyDone(i))
    end do
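    ! The expression nvshmem_counter - size(this%send_pe) + i - 1
    ! reconstructs the counter value used when pushing to the i-th PE
    ! above, since the counter has been bumped once per PE in between.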
    do done_req = 1, size(this%recv_pe)
       call cuda_gs_unpack(u_d, op, &
            this%recv_buf%buf_d, &
            this%recv_buf%dof_d, &
            this%recv_buf%offset(done_req), &
            this%recv_buf%ndofs(done_req), &
            this%stream(done_req))
    end do
    do done_req = 1, size(this%recv_pe)
       call device_stream_wait_event(strm, &
            this%event(done_req), 0)
    end do
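    ! Block the caller's stream on the per-PE events (presumably recorded
    ! on each unpack stream) so that work submitted to strm after nbwait
    ! only runs once every unpack has completed.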