Neko 1.99.2
A portable framework for high-order spectral element flow simulations
gs_device_shmem.F90
! Copyright (c) 2020-2024, The Neko Authors
! All rights reserved.
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
!
!   * Redistributions of source code must retain the above copyright
!     notice, this list of conditions and the following disclaimer.
!
!   * Redistributions in binary form must reproduce the above
!     copyright notice, this list of conditions and the following
!     disclaimer in the documentation and/or other materials provided
!     with the distribution.
!
!   * Neither the name of the authors nor the names of its
!     contributors may be used to endorse or promote products derived
!     from this software without specific prior written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!> Defines gather-scatter communication using device SHMEM
module gs_device_shmem
  use num_types, only : rp, c_rp
  use gs_comm, only : gs_comm_t
  use stack, only : stack_i4_t
  use htable, only : htable_i4_t
  use device
  use comm, only : pe_size, pe_rank, neko_comm
  use mpi_f08, only : mpi_allreduce, mpi_integer, &
       mpi_max, mpi_sendrecv, mpi_status_ignore
  use utils, only : neko_error
  use, intrinsic :: iso_c_binding, only : c_sizeof, c_int32_t, &
       c_ptr, c_null_ptr, c_size_t, c_associated
  implicit none
  private

  !> Buffers for non-blocking communication and packing/unpacking
  type, private :: gs_device_shmem_buf_t
     integer, allocatable :: ndofs(:)
     integer, allocatable :: offset(:)
     integer, allocatable :: remote_offset(:)
     integer :: total
     type(c_ptr) :: buf_d = c_null_ptr
     type(c_ptr) :: dof_d = c_null_ptr
   contains
     procedure, pass(this) :: init => gs_device_shmem_buf_init
     procedure, pass(this) :: free => gs_device_shmem_buf_free
  end type gs_device_shmem_buf_t

  !> Gather-scatter communication using device SHMEM.
  !! The arrays are indexed per PE like @a send_pe and @a recv_pe.
  type, public, extends(gs_comm_t) :: gs_device_shmem_t
     type(gs_device_shmem_buf_t) :: send_buf
     type(gs_device_shmem_buf_t) :: recv_buf
     type(c_ptr), allocatable :: stream(:)
     type(c_ptr), allocatable :: event(:)
     integer :: nvshmem_counter = 1
     type(c_ptr), allocatable :: notifydone(:)
     type(c_ptr), allocatable :: notifyready(:)
   contains
     procedure, pass(this) :: init => gs_device_shmem_init
     procedure, pass(this) :: free => gs_device_shmem_free
     procedure, pass(this) :: nbsend => gs_device_shmem_nbsend
     procedure, pass(this) :: nbrecv => gs_device_shmem_nbrecv
     procedure, pass(this) :: nbwait => gs_device_shmem_nbwait
  end type gs_device_shmem_t

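  ! C bindings to the CUDA kernels and NVSHMEM allocation helpers that back
  ! this communication method; only compiled when both CUDA and NVSHMEM are
  ! enabled.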
#if defined (HAVE_CUDA) && defined(HAVE_NVSHMEM)

  interface
     subroutine cudamalloc_nvshmem(ptr, size) &
          bind(c, name = 'cudamalloc_nvshmem')
       use, intrinsic :: iso_c_binding
       implicit none
       type(c_ptr) :: ptr
       integer(c_size_t), value :: size
     end subroutine cudamalloc_nvshmem
  end interface

  interface
     subroutine cudafree_nvshmem(ptr) &
          bind(c, name = 'cudafree_nvshmem')
       use, intrinsic :: iso_c_binding
       implicit none
       type(c_ptr) :: ptr
     end subroutine cudafree_nvshmem
  end interface

  interface
     subroutine cuda_gs_pack_and_push(u_d, buf_d, dof_d, offset, n, stream, &
          srank, rbuf_d, roffset, remote_offset, &
          rrank, nvshmem_counter, notifyDone, &
          notifyReady, iter) &
          bind(c, name = 'cuda_gs_pack_and_push')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: n, offset, srank, roffset, rrank, iter
       integer(c_int), value :: nvshmem_counter
       type(c_ptr), value :: u_d, buf_d, dof_d, stream, rbuf_d, notifyDone, &
            notifyReady
       integer(c_int), dimension(*) :: remote_offset
     end subroutine cuda_gs_pack_and_push
  end interface

  interface
     subroutine cuda_gs_pack_and_push_wait(stream, nvshmem_counter, &
          notifyDone) bind(c, name = 'cuda_gs_pack_and_push_wait')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: nvshmem_counter
       type(c_ptr), value :: stream, notifyDone
     end subroutine cuda_gs_pack_and_push_wait
  end interface

  interface
     subroutine cuda_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
          bind(c, name = 'cuda_gs_unpack')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: op, offset, n
       type(c_ptr), value :: u_d, buf_d, dof_d, stream
     end subroutine cuda_gs_unpack
  end interface
#endif

contains

  subroutine gs_device_shmem_buf_init(this, pe_order, dof_stack, mark_dupes)
    class(gs_device_shmem_buf_t), intent(inout) :: this
    integer, allocatable, intent(inout) :: pe_order(:)
    type(stack_i4_t), allocatable, intent(inout) :: dof_stack(:)
    logical, intent(in) :: mark_dupes
    integer, allocatable :: dofs(:)
    integer :: i, j, total, max_total
    integer(c_size_t) :: sz
    type(htable_i4_t) :: doftable
    integer :: dupe, marked, k
    real(c_rp) :: rp_dummy
    integer(c_int32_t) :: i4_dummy

    allocate(this%ndofs(size(pe_order)))
    allocate(this%offset(size(pe_order)))
    allocate(this%remote_offset(size(pe_order)))

    do i = 1, size(pe_order)
       this%remote_offset(i) = -1
    end do

    total = 0
    do i = 1, size(pe_order)
       this%ndofs(i) = dof_stack(pe_order(i))%size()
       this%offset(i) = total
       total = total + this%ndofs(i)
    end do

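    ! NVSHMEM buffers live on the symmetric heap, so every PE must request
    ! the same size; reduce to the global maximum before allocating.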
    call mpi_allreduce(total, max_total, 1, mpi_integer, mpi_max, neko_comm)

    this%total = total

    sz = c_sizeof(rp_dummy) * max_total
#ifdef HAVE_NVSHMEM
    call cudamalloc_nvshmem(this%buf_d, sz)
#endif

    sz = c_sizeof(i4_dummy) * total
    call device_alloc(this%dof_d, sz)

    if (mark_dupes) call doftable%init(2*total)
    allocate(dofs(total))

    ! Copy from dof_stack into dofs, optionally marking duplicates with doftable
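    ! A dof that appears for more than one PE is stored negated (the first
    ! occurrence is flipped retroactively once a duplicate is found), so the
    ! device kernels can tell shared entries from unique ones.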
    marked = 0
    do i = 1, size(pe_order)
       ! %array() breaks on cray
       select type (arr => dof_stack(pe_order(i))%data)
       type is (integer)
          do j = 1, this%ndofs(i)
             k = this%offset(i) + j
             if (mark_dupes) then
                if (doftable%get(arr(j), dupe) .eq. 0) then
                   if (dofs(dupe) .gt. 0) then
                      dofs(dupe) = -dofs(dupe)
                      marked = marked + 1
                   end if
                   dofs(k) = -arr(j)
                   marked = marked + 1
                else
                   call doftable%set(arr(j), k)
                   dofs(k) = arr(j)
                end if
             else
                dofs(k) = arr(j)
             end if
          end do
       end select
    end do

    call device_memcpy(dofs, this%dof_d, total, host_to_device, sync = .true.)

    deallocate(dofs)
    call doftable%free()

  end subroutine gs_device_shmem_buf_init

  subroutine gs_device_shmem_buf_free(this)
    class(gs_device_shmem_buf_t), intent(inout) :: this

    if (allocated(this%ndofs)) deallocate(this%ndofs)
    if (allocated(this%offset)) deallocate(this%offset)
    if (allocated(this%remote_offset)) deallocate(this%remote_offset)

#ifdef HAVE_NVSHMEM
    if (c_associated(this%buf_d)) call cudafree_nvshmem(this%buf_d)
#endif
    if (c_associated(this%dof_d)) call device_free(this%dof_d)

  end subroutine gs_device_shmem_buf_free

  !> Initialise SHMEM based communication method
  subroutine gs_device_shmem_init(this, send_pe, recv_pe)
    class(gs_device_shmem_t), intent(inout) :: this
    type(stack_i4_t), intent(inout) :: send_pe
    type(stack_i4_t), intent(inout) :: recv_pe
    integer :: i

    call this%init_order(send_pe, recv_pe)

    call this%send_buf%init(this%send_pe, this%send_dof, .false.)
    call this%recv_buf%init(this%recv_pe, this%recv_dof, .true.)

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
    ! Create a set of non-blocking streams
    allocate(this%stream(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       call device_stream_create_with_priority(this%stream(i), 1, &
            strm_high_prio)
    end do

    allocate(this%event(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       call device_event_create(this%event(i), 2)
    end do

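    ! Per-peer 8-byte flags allocated on the NVSHMEM symmetric heap; the
    ! pack-and-push kernels use them as ready/done signals between PEs.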
#ifdef HAVE_NVSHMEM
    allocate(this%notifyDone(size(this%recv_pe)))
    allocate(this%notifyReady(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       call cudamalloc_nvshmem(this%notifyDone(i), 8_8)
       call cudamalloc_nvshmem(this%notifyReady(i), 8_8)
    end do
#endif
#endif

  end subroutine gs_device_shmem_init

  !> Deallocate SHMEM based communication method
  subroutine gs_device_shmem_free(this)
    class(gs_device_shmem_t), intent(inout) :: this
    integer :: i

    call this%send_buf%free()
    call this%recv_buf%free()

    call this%free_order()
    call this%free_dofs()

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
    if (allocated(this%stream)) then
       do i = 1, size(this%stream)
          call device_stream_destroy(this%stream(i))
       end do
       deallocate(this%stream)
    end if
#endif

  end subroutine gs_device_shmem_free

  !> Post non-blocking send operations
  subroutine gs_device_shmem_nbsend(this, u, n, deps, strm)
    class(gs_device_shmem_t), intent(inout) :: this
    integer, intent(in) :: n
    real(kind=rp), dimension(n), intent(inout) :: u
    type(c_ptr), intent(inout) :: deps
    type(c_ptr), intent(inout) :: strm
    integer :: i
    type(c_ptr) :: u_d

    u_d = device_get_ptr(u)

    do i = 1, size(this%send_pe)
       call device_stream_wait_event(this%stream(i), deps, 0)
       ! Not clear why this sync is required, but there seems to be a race
       ! condition without it for certain run configurations
       call device_sync(this%stream(i))
    end do

    ! The rest is done in the "wait" routine below

  end subroutine gs_device_shmem_nbsend

  !> Post non-blocking receive operations
  subroutine gs_device_shmem_nbrecv(this)
    class(gs_device_shmem_t), intent(inout) :: this

    ! Everything is done in the "wait" routine below

  end subroutine gs_device_shmem_nbrecv

  !> Wait for non-blocking operations
  subroutine gs_device_shmem_nbwait(this, u, n, op, strm)
    class(gs_device_shmem_t), intent(inout) :: this
    integer, intent(in) :: n
    real(kind=rp), dimension(n), intent(inout) :: u
    type(c_ptr), intent(inout) :: strm
    integer :: op, done_req, i
    type(c_ptr) :: u_d

    u_d = device_get_ptr(u)
#ifdef HAVE_NVSHMEM
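    ! On the first call, exchange buffer offsets with each peer so the push
    ! kernel knows where to write in the peer's symmetric receive buffer
    ! (remote_offset is initialised to -1 in the buffer setup).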
    do i = 1, size(this%send_pe)
       if (this%recv_buf%remote_offset(i) .eq. -1) then
          call mpi_sendrecv(this%recv_buf%offset(i), 1, mpi_integer, &
               this%recv_pe(i), 0, &
               this%recv_buf%remote_offset(i), 1, mpi_integer, &
               this%send_pe(i), 0, neko_comm, mpi_status_ignore)
       end if

       call cuda_gs_pack_and_push(u_d, &
            this%send_buf%buf_d, &
            this%send_buf%dof_d, &
            this%send_buf%offset(i), &
            this%send_buf%ndofs(i), &
            this%stream(i), &
            this%send_pe(i), &
            this%recv_buf%buf_d, &
            this%recv_buf%offset(i), &
            this%recv_buf%remote_offset, &
            this%recv_pe(i), &
            this%nvshmem_counter, &
            this%notifyDone(i), &
            this%notifyReady(i), &
            i)
       this%nvshmem_counter = this%nvshmem_counter + 1
    end do

    do i = 1, size(this%send_pe)
       call cuda_gs_pack_and_push_wait(this%stream(i), &
            this%nvshmem_counter - size(this%send_pe) + i - 1, &
            this%notifyDone(i))
    end do

    do done_req = 1, size(this%recv_pe)
       call cuda_gs_unpack(u_d, op, &
            this%recv_buf%buf_d, &
            this%recv_buf%dof_d, &
            this%recv_buf%offset(done_req), &
            this%recv_buf%ndofs(done_req), &
            this%stream(done_req))
       call device_event_record(this%event(done_req), this%stream(done_req))
    end do

    ! Sync non-blocking streams
    do done_req = 1, size(this%recv_pe)
       call device_stream_wait_event(strm, this%event(done_req), 0)
    end do
#endif
  end subroutine gs_device_shmem_nbwait

end module gs_device_shmem