Neko 0.9.99
A portable framework for high-order spectral element flow simulations
gs_device_mpi.F90
! Copyright (c) 2020-2023, The Neko Authors
! All rights reserved.
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
!
!   * Redistributions of source code must retain the above copyright
!     notice, this list of conditions and the following disclaimer.
!
!   * Redistributions in binary form must reproduce the above
!     copyright notice, this list of conditions and the following
!     disclaimer in the documentation and/or other materials provided
!     with the distribution.
!
!   * Neither the name of the authors nor the names of its
!     contributors may be used to endorse or promote products derived
!     from this software without specific prior written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!> Defines GPU aware MPI gather-scatter communication
module gs_device_mpi
  use num_types, only : rp, c_rp
  use gs_comm, only : gs_comm_t
  use gs_ops
  use stack, only : stack_i4_t
  use comm, only : pe_size, pe_rank
  use htable, only : htable_i4_t
  use device
  use utils, only : neko_error
  use, intrinsic :: iso_c_binding, only : c_sizeof, c_int32_t
  implicit none
  private

  !> Buffers for non-blocking communication and packing/unpacking
  type, private :: gs_device_mpi_buf_t
     integer, allocatable :: ndofs(:)  !< Number of dofs to send/recv per PE
     integer, allocatable :: offset(:) !< Per-PE offsets into the buffer
     integer :: total                  !< Total number of dofs in the buffer
     type(c_ptr) :: reqs = c_null_ptr  !< Array of MPI requests (one per PE)
     type(c_ptr) :: buf_d = c_null_ptr !< Device staging buffer
     type(c_ptr) :: dof_d = c_null_ptr !< Device copy of the dof index list
   contains
     procedure, pass(this) :: init => gs_device_mpi_buf_init
     procedure, pass(this) :: free => gs_device_mpi_buf_free
  end type gs_device_mpi_buf_t

  !> Gather-scatter communication using device MPI.
  !! The arrays are indexed per PE, like send_pe and recv_pe.
  type, public, extends(gs_comm_t) :: gs_device_mpi_t
     type(gs_device_mpi_buf_t) :: send_buf
     type(gs_device_mpi_buf_t) :: recv_buf
     type(c_ptr), allocatable :: stream(:) !< Per-PE non-blocking streams
     type(c_ptr), allocatable :: event(:)  !< Per-PE events for stream sync
     integer :: nb_strtgy                  !< Bitmask selecting the strategy
     type(c_ptr) :: send_event = c_null_ptr
   contains
     procedure, pass(this) :: init => gs_device_mpi_init
     procedure, pass(this) :: free => gs_device_mpi_free
     procedure, pass(this) :: nbsend => gs_device_mpi_nbsend
     procedure, pass(this) :: nbrecv => gs_device_mpi_nbrecv
     procedure, pass(this) :: nbwait => gs_device_mpi_nbwait
  end type gs_device_mpi_t

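  ! The nb_strtgy bitmask selects how nbsend/nbwait below overlap packing,
  ! unpacking and MPI progress: with bit 0 cleared, nbsend packs the whole
  ! send buffer on the caller's stream before posting one isend per PE;
  ! with bit 0 set, each destination's chunk is packed on its own
  ! non-blocking stream. Likewise, with bit 1 cleared, nbwait waits for
  ! all receives and unpacks in a single kernel launch, while with bit 1
  ! set each receive is unpacked on its own stream as soon as it
  ! completes. gs_device_mpi_init currently sets nb_strtgy = 0.
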
#ifdef HAVE_HIP
  interface
     subroutine hip_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
          bind(c, name='hip_gs_pack')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: n, offset
       type(c_ptr), value :: u_d, buf_d, dof_d, stream
     end subroutine hip_gs_pack
  end interface

  interface
     subroutine hip_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
          bind(c, name='hip_gs_unpack')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: op, offset, n
       type(c_ptr), value :: u_d, buf_d, dof_d, stream
     end subroutine hip_gs_unpack
  end interface
#elif HAVE_CUDA
  interface
     subroutine cuda_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
          bind(c, name='cuda_gs_pack')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: n, offset
       type(c_ptr), value :: u_d, buf_d, dof_d, stream
     end subroutine cuda_gs_pack
  end interface

  interface
     subroutine cuda_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
          bind(c, name='cuda_gs_unpack')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: op, offset, n
       type(c_ptr), value :: u_d, buf_d, dof_d, stream
     end subroutine cuda_gs_unpack
  end interface
#endif
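
  ! The pack/unpack kernels implement the device-side gather and scatter
  ! around the MPI exchange. As a rough host-side sketch of the assumed
  ! semantics (not the actual kernels, which live in gs.cu and its HIP
  ! equivalent), pack gathers the values addressed by the dof list into a
  ! contiguous staging buffer,
  !
  !    do i = 1, n
  !       buf(offset + i) = u(dof(offset + i))
  !    end do
  !
  ! while unpack scatters buffer entries back into u, combining them with
  ! the reduction selected by op (one of the operations from gs_ops).
  ! Negative dof entries, produced by the duplicate marking below, flag
  ! buffer slots that target the same entry of u and must be combined
  ! carefully.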

  interface
     subroutine device_mpi_init_reqs(n, reqs) &
          bind(c, name='device_mpi_init_reqs')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: n
       type(c_ptr) :: reqs
     end subroutine device_mpi_init_reqs
  end interface

  interface
     subroutine device_mpi_free_reqs(reqs) &
          bind(c, name='device_mpi_free_reqs')
       use, intrinsic :: iso_c_binding
       implicit none
       type(c_ptr) :: reqs
     end subroutine device_mpi_free_reqs
  end interface

  interface
     subroutine device_mpi_isend(buf_d, offset, nbytes, rank, reqs, i) &
          bind(c, name='device_mpi_isend')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: offset, nbytes, rank, i
       type(c_ptr), value :: buf_d, reqs
     end subroutine device_mpi_isend
  end interface

  interface
     subroutine device_mpi_irecv(buf_d, offset, nbytes, rank, reqs, i) &
          bind(c, name='device_mpi_irecv')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: offset, nbytes, rank, i
       type(c_ptr), value :: buf_d, reqs
     end subroutine device_mpi_irecv
  end interface

  interface
     integer(c_int) function device_mpi_test(reqs, i) &
          bind(c, name='device_mpi_test')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: i
       type(c_ptr), value :: reqs
     end function device_mpi_test
  end interface

  interface
     subroutine device_mpi_waitall(n, reqs) &
          bind(c, name='device_mpi_waitall')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: n
       type(c_ptr), value :: reqs
     end subroutine device_mpi_waitall
  end interface

  interface
     integer(c_int) function device_mpi_waitany(n, reqs, i) &
          bind(c, name='device_mpi_waitany')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: n
       integer(c_int) :: i
       type(c_ptr), value :: reqs
     end function device_mpi_waitany
  end interface
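
  ! Sketch of the intended call order, following the usual non-blocking
  ! exchange pattern (the gs_ops constant GS_OP_ADD is assumed here for
  ! illustration):
  !
  !    call gs%nbrecv()                      ! post receives first
  !    call gs%nbsend(u, n, deps, strm)      ! pack and post sends
  !    ! ... potential overlap with local gather-scatter work ...
  !    call gs%nbwait(u, n, GS_OP_ADD, strm) ! complete and unpack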

contains

  subroutine gs_device_mpi_buf_init(this, pe_order, dof_stack, mark_dupes)
    class(gs_device_mpi_buf_t), intent(inout) :: this
    integer, allocatable, intent(inout) :: pe_order(:)
    type(stack_i4_t), allocatable, intent(inout) :: dof_stack(:)
    logical, intent(in) :: mark_dupes
    integer, allocatable :: dofs(:)
    integer :: i, j, total
    integer(c_size_t) :: sz
    type(htable_i4_t) :: doftable
    integer :: dupe, marked, k
    real(c_rp) :: rp_dummy
    integer(c_int32_t) :: i4_dummy

    call device_mpi_init_reqs(size(pe_order), this%reqs)

    allocate(this%ndofs(size(pe_order)))
    allocate(this%offset(size(pe_order)))

    total = 0
    do i = 1, size(pe_order)
       this%ndofs(i) = dof_stack(pe_order(i))%size()
       this%offset(i) = total
       total = total + this%ndofs(i)
    end do

    this%total = total

    sz = c_sizeof(rp_dummy) * total
    call device_alloc(this%buf_d, sz)

    sz = c_sizeof(i4_dummy) * total
    call device_alloc(this%dof_d, sz)

    if (mark_dupes) call doftable%init(2*total)
    allocate(dofs(total))

    ! Copy from dof_stack into dofs, optionally marking duplicates with doftable
    marked = 0
    do i = 1, size(pe_order)
       ! %array() breaks on cray
       select type (arr => dof_stack(pe_order(i))%data)
       type is (integer)
          do j = 1, this%ndofs(i)
             k = this%offset(i) + j
             if (mark_dupes) then
                if (doftable%get(arr(j), dupe) .eq. 0) then
                   ! Already in the table: negate both occurrences
                   if (dofs(dupe) .gt. 0) then
                      dofs(dupe) = -dofs(dupe)
                      marked = marked + 1
                   end if
                   dofs(k) = -arr(j)
                   marked = marked + 1
                else
                   call doftable%set(arr(j), k)
                   dofs(k) = arr(j)
                end if
             else
                dofs(k) = arr(j)
             end if
          end do
       end select
    end do

    call device_memcpy(dofs, this%dof_d, total, host_to_device, sync=.false.)

    deallocate(dofs)
    call doftable%free()

  end subroutine gs_device_mpi_buf_init

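  ! Worked example of the duplicate marking above: if the receive dofs
  ! across all PEs flatten to dofs = (/ 5, 9, 5 /), the first 5 is entered
  ! into doftable at position 1; when the second 5 is seen at position 3,
  ! both occurrences are negated, giving dofs = (/ -5, 9, -5 /) and
  ! marked = 2. The sign presumably tells the unpack kernel that these
  ! buffer slots update the same entry of u and must be combined safely.
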
  subroutine gs_device_mpi_buf_free(this)
    class(gs_device_mpi_buf_t), intent(inout) :: this

    if (c_associated(this%reqs)) call device_mpi_free_reqs(this%reqs)

    if (allocated(this%ndofs)) deallocate(this%ndofs)
    if (allocated(this%offset)) deallocate(this%offset)

    if (c_associated(this%buf_d)) call device_free(this%buf_d)
    if (c_associated(this%dof_d)) call device_free(this%dof_d)
  end subroutine gs_device_mpi_buf_free

  !> Initialise MPI based communication method
  subroutine gs_device_mpi_init(this, send_pe, recv_pe)
    class(gs_device_mpi_t), intent(inout) :: this
    type(stack_i4_t), intent(inout) :: send_pe
    type(stack_i4_t), intent(inout) :: recv_pe
    integer :: i

    call this%init_order(send_pe, recv_pe)

    ! Only the receive buffer marks duplicates; the send side packs
    ! each dof as-is
    call this%send_buf%init(this%send_pe, this%send_dof, .false.)
    call this%recv_buf%init(this%recv_pe, this%recv_dof, .true.)

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
    ! Create a set of non-blocking streams
    allocate(this%stream(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       call device_stream_create_with_priority(this%stream(i), 1, strm_high_prio)
    end do

    allocate(this%event(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       call device_event_create(this%event(i), 2)
    end do
#endif

    this%nb_strtgy = 0

  end subroutine gs_device_mpi_init

  !> Deallocate MPI based communication method
  subroutine gs_device_mpi_free(this)
    class(gs_device_mpi_t), intent(inout) :: this
    integer :: i

    call this%send_buf%free()
    call this%recv_buf%free()

    call this%free_order()
    call this%free_dofs()

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
    if (allocated(this%stream)) then
       do i = 1, size(this%stream)
          call device_stream_destroy(this%stream(i))
       end do
       deallocate(this%stream)
    end if

    ! Also release the per-PE events created in init (assumes
    ! device_event_destroy, the counterpart of device_event_create)
    if (allocated(this%event)) then
       do i = 1, size(this%event)
          call device_event_destroy(this%event(i))
       end do
       deallocate(this%event)
    end if
#endif

  end subroutine gs_device_mpi_free

  !> Post non-blocking send operations
  subroutine gs_device_mpi_nbsend(this, u, n, deps, strm)
    class(gs_device_mpi_t), intent(inout) :: this
    integer, intent(in) :: n
    real(kind=rp), dimension(n), intent(inout) :: u
    type(c_ptr), intent(inout) :: deps
    type(c_ptr), intent(inout) :: strm
    integer :: i
    type(c_ptr) :: u_d

    u_d = device_get_ptr(u)

    if (iand(this%nb_strtgy, 1) .eq. 0) then

       ! Strategy 0: pack the whole send buffer on strm, sync once, and
       ! post one isend per destination PE
#ifdef HAVE_HIP
       call hip_gs_pack(u_d, &
            this%send_buf%buf_d, &
            this%send_buf%dof_d, &
            0, this%send_buf%total, &
            strm)
#elif HAVE_CUDA
       call cuda_gs_pack(u_d, &
            this%send_buf%buf_d, &
            this%send_buf%dof_d, &
            0, this%send_buf%total, &
            strm)
#else
       call neko_error('gs_device_mpi: no backend')
#endif

       call device_sync(strm)

       do i = 1, size(this%send_pe)
          call device_mpi_isend(this%send_buf%buf_d, &
               rp*this%send_buf%offset(i), &
               rp*this%send_buf%ndofs(i), this%send_pe(i), &
               this%send_buf%reqs, i)
       end do

    else

       ! Strategy 1: pack each destination's chunk on its own stream,
       ! overlapping packing across PEs
       do i = 1, size(this%send_pe)
          call device_stream_wait_event(this%stream(i), deps, 0)
#ifdef HAVE_HIP
          call hip_gs_pack(u_d, &
               this%send_buf%buf_d, &
               this%send_buf%dof_d, &
               this%send_buf%offset(i), &
               this%send_buf%ndofs(i), &
               this%stream(i))
#elif HAVE_CUDA
          call cuda_gs_pack(u_d, &
               this%send_buf%buf_d, &
               this%send_buf%dof_d, &
               this%send_buf%offset(i), &
               this%send_buf%ndofs(i), &
               this%stream(i))
#else
          call neko_error('gs_device_mpi: no backend')
#endif
       end do

       ! Consider adding a poll loop here once we have device_query in place
       do i = 1, size(this%send_pe)
          call device_sync(this%stream(i))
          call device_mpi_isend(this%send_buf%buf_d, &
               rp*this%send_buf%offset(i), &
               rp*this%send_buf%ndofs(i), this%send_pe(i), &
               this%send_buf%reqs, i)
       end do
    end if

  end subroutine gs_device_mpi_nbsend

  !> Post non-blocking receive operations
  subroutine gs_device_mpi_nbrecv(this)
    class(gs_device_mpi_t), intent(inout) :: this
    integer :: i

    do i = 1, size(this%recv_pe)
       call device_mpi_irecv(this%recv_buf%buf_d, rp*this%recv_buf%offset(i), &
            rp*this%recv_buf%ndofs(i), this%recv_pe(i), &
            this%recv_buf%reqs, i)
    end do

  end subroutine gs_device_mpi_nbrecv

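  ! A note on the isend/irecv calls: device_mpi_isend and device_mpi_irecv
  ! take byte offsets and byte counts, so the per-PE dof offsets and
  ! counts are scaled by rp. This assumes the kind value rp equals the
  ! size in bytes of a real(kind=rp), which holds for the default kind
  ! numbering of the supported compilers.
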
  !> Wait for non-blocking operations
  subroutine gs_device_mpi_nbwait(this, u, n, op, strm)
    class(gs_device_mpi_t), intent(inout) :: this
    integer, intent(in) :: n
    real(kind=rp), dimension(n), intent(inout) :: u
    integer, intent(in) :: op
    type(c_ptr), intent(inout) :: strm
    integer :: done_req, i
    type(c_ptr) :: u_d

    u_d = device_get_ptr(u)

    if (iand(this%nb_strtgy, 2) .eq. 0) then
       ! Strategy 0: wait for all receives, then unpack the whole buffer
       call device_mpi_waitall(size(this%recv_pe), this%recv_buf%reqs)

#ifdef HAVE_HIP
       call hip_gs_unpack(u_d, op, &
            this%recv_buf%buf_d, &
            this%recv_buf%dof_d, &
            0, this%recv_buf%total, &
            strm)
#elif HAVE_CUDA
       call cuda_gs_unpack(u_d, op, &
            this%recv_buf%buf_d, &
            this%recv_buf%dof_d, &
            0, this%recv_buf%total, &
            strm)
#else
       call neko_error('gs_device_mpi: no backend')
#endif

       call device_mpi_waitall(size(this%send_pe), this%send_buf%reqs)

       ! Syncing here seems to prevent some race condition
       call device_sync(strm)

    else

       ! Strategy 1: unpack each receive on its own stream as it completes
       do while(device_mpi_waitany(size(this%recv_pe), &
            this%recv_buf%reqs, done_req) .ne. 0)

#ifdef HAVE_HIP
          call hip_gs_unpack(u_d, op, &
               this%recv_buf%buf_d, &
               this%recv_buf%dof_d, &
               this%recv_buf%offset(done_req), &
               this%recv_buf%ndofs(done_req), &
               this%stream(done_req))
#elif HAVE_CUDA
          call cuda_gs_unpack(u_d, op, &
               this%recv_buf%buf_d, &
               this%recv_buf%dof_d, &
               this%recv_buf%offset(done_req), &
               this%recv_buf%ndofs(done_req), &
               this%stream(done_req))
#else
          call neko_error('gs_device_mpi: no backend')
#endif
          call device_event_record(this%event(done_req), this%stream(done_req))
       end do

       call device_mpi_waitall(size(this%send_pe), this%send_buf%reqs)

       ! Sync non-blocking streams
       do done_req = 1, size(this%recv_pe)
          call device_stream_wait_event(strm, &
               this%event(done_req), 0)
       end do

    end if

  end subroutine gs_device_mpi_nbwait
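
  ! In the strategy-1 branch of nbwait, device_mpi_waitany is assumed to
  ! mirror MPI_Waitany: it returns nonzero and sets done_req to the index
  ! of a completed receive while requests remain, and returns zero once
  ! all receives have been processed. The event recorded after each
  ! per-PE unpack lets the caller's stream (strm) wait on all unpacks
  ! without a host-side synchronization.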

end module gs_device_mpi