Neko  0.9.99
A portable framework for high-order spectral element flow simulations
gs_device_mpi.F90
! Copyright (c) 2020-2023, The Neko Authors
! All rights reserved.
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
!
!   * Redistributions of source code must retain the above copyright
!     notice, this list of conditions and the following disclaimer.
!
!   * Redistributions in binary form must reproduce the above
!     copyright notice, this list of conditions and the following
!     disclaimer in the documentation and/or other materials provided
!     with the distribution.
!
!   * Neither the name of the authors nor the names of its
!     contributors may be used to endorse or promote products derived
!     from this software without specific prior written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!> Defines GPU aware MPI gather-scatter communication
module gs_device_mpi
  use num_types, only : rp, c_rp
  use gs_comm, only : gs_comm_t
  use gs_ops
  use stack, only : stack_i4_t
  use comm, only : pe_size, pe_rank
  use htable, only : htable_i4_t
  use device
  use utils, only : neko_error
  use, intrinsic :: iso_c_binding, only : c_sizeof, c_int32_t
  implicit none
  private

  !> Buffers for non-blocking communication and packing/unpacking
  type, private :: gs_device_mpi_buf_t
     integer, allocatable :: ndofs(:)  !< Number of dofs to exchange per PE
     integer, allocatable :: offset(:) !< Per-PE offset into the buffer
     integer :: total                  !< Total number of dofs in the buffer
     type(c_ptr) :: reqs = c_null_ptr  !< Opaque handle to an array of MPI requests
     type(c_ptr) :: buf_d = c_null_ptr !< Device staging buffer
     type(c_ptr) :: dof_d = c_null_ptr !< Device copy of the dof mapping
   contains
     procedure, pass(this) :: init => gs_device_mpi_buf_init
     procedure, pass(this) :: free => gs_device_mpi_buf_free
  end type gs_device_mpi_buf_t

  !> Gather-scatter communication using device MPI.
  !! The arrays are indexed per PE like @a send_pe and @a recv_pe.
  type, public, extends(gs_comm_t) :: gs_device_mpi_t
     type(gs_device_mpi_buf_t) :: send_buf
     type(gs_device_mpi_buf_t) :: recv_buf
     type(c_ptr), allocatable :: stream(:) !< Per-PE non-blocking streams
     type(c_ptr), allocatable :: event(:)  !< Per-PE events
     integer :: nb_strtgy                  !< Non-blocking strategy selector
     type(c_ptr) :: send_event = c_null_ptr
   contains
     procedure, pass(this) :: init => gs_device_mpi_init
     procedure, pass(this) :: free => gs_device_mpi_free
     procedure, pass(this) :: nbsend => gs_device_mpi_nbsend
     procedure, pass(this) :: nbrecv => gs_device_mpi_nbrecv
     procedure, pass(this) :: nbwait => gs_device_mpi_nbwait
  end type gs_device_mpi_t

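  ! Note: nb_strtgy selects between the communication strategies implemented
  ! below. If iand(nb_strtgy, 1) is zero, nbsend packs the whole send buffer
  ! on the caller's stream and posts all sends at once; otherwise packing is
  ! split per destination PE onto the dedicated streams. Likewise, if
  ! iand(nb_strtgy, 2) is zero, nbwait waits for all receives before a single
  ! unpack; otherwise each message is unpacked as soon as it arrives
  ! (via device_mpi_waitany).
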
#ifdef HAVE_HIP
  interface
     subroutine hip_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
          bind(c, name='hip_gs_pack')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: n, offset
       type(c_ptr), value :: u_d, buf_d, dof_d, stream
     end subroutine hip_gs_pack
  end interface

  interface
     subroutine hip_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
          bind(c, name='hip_gs_unpack')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: op, offset, n
       type(c_ptr), value :: u_d, buf_d, dof_d, stream
     end subroutine hip_gs_unpack
  end interface
#elif HAVE_CUDA
  interface
     subroutine cuda_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
          bind(c, name='cuda_gs_pack')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: n, offset
       type(c_ptr), value :: u_d, buf_d, dof_d, stream
     end subroutine cuda_gs_pack
  end interface

  interface
     subroutine cuda_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
          bind(c, name='cuda_gs_unpack')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: op, offset, n
       type(c_ptr), value :: u_d, buf_d, dof_d, stream
     end subroutine cuda_gs_unpack
  end interface
#endif
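
  ! The pack kernels gather the dofs listed in dof_d from the field u_d into
  ! the contiguous staging buffer buf_d on the given stream; the unpack
  ! kernels scatter buf_d back into u_d, combining values according to the
  ! gather-scatter operation op (see gs_ops).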

  interface
     subroutine device_mpi_init_reqs(n, reqs) &
          bind(c, name='device_mpi_init_reqs')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: n
       type(c_ptr) :: reqs
     end subroutine device_mpi_init_reqs
  end interface

  interface
     subroutine device_mpi_free_reqs(reqs) &
          bind(c, name='device_mpi_free_reqs')
       use, intrinsic :: iso_c_binding
       implicit none
       type(c_ptr) :: reqs
     end subroutine device_mpi_free_reqs
  end interface

  interface
     subroutine device_mpi_isend(buf_d, offset, nbytes, rank, reqs, i) &
          bind(c, name='device_mpi_isend')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: offset, nbytes, rank, i
       type(c_ptr), value :: buf_d, reqs
     end subroutine device_mpi_isend
  end interface

  interface
     subroutine device_mpi_irecv(buf_d, offset, nbytes, rank, reqs, i) &
          bind(c, name='device_mpi_irecv')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: offset, nbytes, rank, i
       type(c_ptr), value :: buf_d, reqs
     end subroutine device_mpi_irecv
  end interface

  interface
     integer(c_int) function device_mpi_test(reqs, i) &
          bind(c, name='device_mpi_test')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: i
       type(c_ptr), value :: reqs
     end function device_mpi_test
  end interface

  interface
     subroutine device_mpi_waitall(n, reqs) &
          bind(c, name='device_mpi_waitall')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: n
       type(c_ptr), value :: reqs
     end subroutine device_mpi_waitall
  end interface

  interface
     integer(c_int) function device_mpi_waitany(n, reqs, i) &
          bind(c, name='device_mpi_waitany')
       use, intrinsic :: iso_c_binding
       implicit none
       integer(c_int), value :: n
       integer(c_int) :: i
       type(c_ptr), value :: reqs
     end function device_mpi_waitany
  end interface
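
  ! The device_mpi_* routines are thin C wrappers around non-blocking MPI
  ! calls that operate directly on device buffers (GPU aware MPI). The reqs
  ! argument is an opaque handle to an array of MPI requests created by
  ! device_mpi_init_reqs and indexed per PE by i.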

contains

  subroutine gs_device_mpi_buf_init(this, pe_order, dof_stack, mark_dupes)
    class(gs_device_mpi_buf_t), intent(inout) :: this
    integer, allocatable, intent(inout) :: pe_order(:)
    type(stack_i4_t), allocatable, intent(inout) :: dof_stack(:)
    logical, intent(in) :: mark_dupes
    integer, allocatable :: dofs(:)
    integer :: i, j, total
    integer(c_size_t) :: sz
    type(htable_i4_t) :: doftable
    integer :: dupe, marked, k
    real(c_rp) :: rp_dummy
    integer(c_int32_t) :: i4_dummy

    call device_mpi_init_reqs(size(pe_order), this%reqs)

    allocate(this%ndofs(size(pe_order)))
    allocate(this%offset(size(pe_order)))

    total = 0
    do i = 1, size(pe_order)
       this%ndofs(i) = dof_stack(pe_order(i))%size()
       this%offset(i) = total
       total = total + this%ndofs(i)
    end do

    this%total = total

    sz = c_sizeof(rp_dummy) * total
    call device_alloc(this%buf_d, sz)

    sz = c_sizeof(i4_dummy) * total
    call device_alloc(this%dof_d, sz)

    if (mark_dupes) call doftable%init(2*total)
    allocate(dofs(total))

    ! Copy from dof_stack into dofs, optionally marking duplicates with doftable
    marked = 0
    do i = 1, size(pe_order)
       ! %array() breaks on cray
       select type (arr => dof_stack(pe_order(i))%data)
       type is (integer)
          do j = 1, this%ndofs(i)
             k = this%offset(i) + j
             if (mark_dupes) then
                if (doftable%get(arr(j), dupe) .eq. 0) then
                   ! Dof already seen: negate both occurrences to flag
                   ! the duplicates in the buffer
                   if (dofs(dupe) .gt. 0) then
                      dofs(dupe) = -dofs(dupe)
                      marked = marked + 1
                   end if
                   dofs(k) = -arr(j)
                   marked = marked + 1
                else
                   call doftable%set(arr(j), k)
                   dofs(k) = arr(j)
                end if
             else
                dofs(k) = arr(j)
             end if
          end do
       end select
    end do

    call device_memcpy(dofs, this%dof_d, total, host_to_device, sync=.false.)

    deallocate(dofs)
    call doftable%free()

  end subroutine gs_device_mpi_buf_init

  subroutine gs_device_mpi_buf_free(this)
    class(gs_device_mpi_buf_t), intent(inout) :: this

    if (c_associated(this%reqs)) call device_mpi_free_reqs(this%reqs)

    if (allocated(this%ndofs)) deallocate(this%ndofs)
    if (allocated(this%offset)) deallocate(this%offset)

    if (c_associated(this%buf_d)) call device_free(this%buf_d)
    if (c_associated(this%dof_d)) call device_free(this%dof_d)
  end subroutine gs_device_mpi_buf_free

  !> Initialise MPI based communication method
  subroutine gs_device_mpi_init(this, send_pe, recv_pe)
    class(gs_device_mpi_t), intent(inout) :: this
    type(stack_i4_t), intent(inout) :: send_pe
    type(stack_i4_t), intent(inout) :: recv_pe
    integer :: i

    call this%init_order(send_pe, recv_pe)

    call this%send_buf%init(this%send_pe, this%send_dof, .false.)
    call this%recv_buf%init(this%recv_pe, this%recv_dof, .true.)

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
    ! Create a set of non-blocking streams
    allocate(this%stream(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       call device_stream_create_with_priority(this%stream(i), 1, strm_high_prio)
    end do

    allocate(this%event(size(this%recv_pe)))
    do i = 1, size(this%recv_pe)
       call device_event_create(this%event(i), 2)
    end do
#endif

    this%nb_strtgy = 0

  end subroutine gs_device_mpi_init

  !> Deallocate MPI based communication method
  subroutine gs_device_mpi_free(this)
    class(gs_device_mpi_t), intent(inout) :: this
    integer :: i

    call this%send_buf%free()
    call this%recv_buf%free()

    call this%free_order()
    call this%free_dofs()

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
    if (allocated(this%stream)) then
       do i = 1, size(this%stream)
          call device_stream_destroy(this%stream(i))
       end do
       deallocate(this%stream)
    end if
#endif

  end subroutine gs_device_mpi_free

  !> Post non-blocking send operations
  subroutine gs_device_mpi_nbsend(this, u, n, deps, strm)
    class(gs_device_mpi_t), intent(inout) :: this
    integer, intent(in) :: n
    real(kind=rp), dimension(n), intent(inout) :: u
    type(c_ptr), intent(inout) :: deps
    type(c_ptr), intent(inout) :: strm
    integer :: i
    type(c_ptr) :: u_d

    u_d = device_get_ptr(u)

    if (iand(this%nb_strtgy, 1) .eq. 0) then

#ifdef HAVE_HIP
       call hip_gs_pack(u_d, &
            this%send_buf%buf_d, &
            this%send_buf%dof_d, &
            0, this%send_buf%total, &
            strm)
#elif HAVE_CUDA
       call cuda_gs_pack(u_d, &
            this%send_buf%buf_d, &
            this%send_buf%dof_d, &
            0, this%send_buf%total, &
            strm)
#else
       call neko_error('gs_device_mpi: no backend')
#endif

       call device_sync(strm)

       do i = 1, size(this%send_pe)
          call device_mpi_isend(this%send_buf%buf_d, &
               rp*this%send_buf%offset(i), &
               rp*this%send_buf%ndofs(i), this%send_pe(i), &
               this%send_buf%reqs, i)
       end do

    else

       do i = 1, size(this%send_pe)
          call device_stream_wait_event(this%stream(i), deps, 0)
#ifdef HAVE_HIP
          call hip_gs_pack(u_d, &
               this%send_buf%buf_d, &
               this%send_buf%dof_d, &
               this%send_buf%offset(i), &
               this%send_buf%ndofs(i), &
               this%stream(i))
#elif HAVE_CUDA
          call cuda_gs_pack(u_d, &
               this%send_buf%buf_d, &
               this%send_buf%dof_d, &
               this%send_buf%offset(i), &
               this%send_buf%ndofs(i), &
               this%stream(i))
#else
          call neko_error('gs_device_mpi: no backend')
#endif
       end do

       ! Consider adding a poll loop here once we have device_query in place
       do i = 1, size(this%send_pe)
          call device_sync(this%stream(i))
          call device_mpi_isend(this%send_buf%buf_d, &
               rp*this%send_buf%offset(i), &
               rp*this%send_buf%ndofs(i), this%send_pe(i), &
               this%send_buf%reqs, i)
       end do
    end if

  end subroutine gs_device_mpi_nbsend

  !> Post non-blocking receive operations
  subroutine gs_device_mpi_nbrecv(this)
    class(gs_device_mpi_t), intent(inout) :: this
    integer :: i

    do i = 1, size(this%recv_pe)
       call device_mpi_irecv(this%recv_buf%buf_d, rp*this%recv_buf%offset(i), &
            rp*this%recv_buf%ndofs(i), this%recv_pe(i), &
            this%recv_buf%reqs, i)
    end do

  end subroutine gs_device_mpi_nbrecv

  !> Wait for non-blocking operations
  subroutine gs_device_mpi_nbwait(this, u, n, op, strm)
    class(gs_device_mpi_t), intent(inout) :: this
    integer, intent(in) :: n
    real(kind=rp), dimension(n), intent(inout) :: u
    type(c_ptr), intent(inout) :: strm
    integer :: op, done_req, i
    type(c_ptr) :: u_d

    u_d = device_get_ptr(u)

    if (iand(this%nb_strtgy, 2) .eq. 0) then
       call device_mpi_waitall(size(this%recv_pe), this%recv_buf%reqs)

#ifdef HAVE_HIP
       call hip_gs_unpack(u_d, op, &
            this%recv_buf%buf_d, &
            this%recv_buf%dof_d, &
            0, this%recv_buf%total, &
            strm)
#elif HAVE_CUDA
       call cuda_gs_unpack(u_d, op, &
            this%recv_buf%buf_d, &
            this%recv_buf%dof_d, &
            0, this%recv_buf%total, &
            strm)
#else
       call neko_error('gs_device_mpi: no backend')
#endif

       call device_mpi_waitall(size(this%send_pe), this%send_buf%reqs)

       ! Syncing here seems to prevent some race condition
       call device_sync(strm)

    else

       do while (device_mpi_waitany(size(this%recv_pe), &
            this%recv_buf%reqs, done_req) .ne. 0)

#ifdef HAVE_HIP
          call hip_gs_unpack(u_d, op, &
               this%recv_buf%buf_d, &
               this%recv_buf%dof_d, &
               this%recv_buf%offset(done_req), &
               this%recv_buf%ndofs(done_req), &
               this%stream(done_req))
#elif HAVE_CUDA
          call cuda_gs_unpack(u_d, op, &
               this%recv_buf%buf_d, &
               this%recv_buf%dof_d, &
               this%recv_buf%offset(done_req), &
               this%recv_buf%ndofs(done_req), &
               this%stream(done_req))
#else
          call neko_error('gs_device_mpi: no backend')
#endif
          call device_event_record(this%event(done_req), this%stream(done_req))
       end do

       call device_mpi_waitall(size(this%send_pe), this%send_buf%reqs)

       ! Sync non-blocking streams
       do done_req = 1, size(this%recv_pe)
          call device_stream_wait_event(strm, &
               this%event(done_req), 0)
       end do

    end if

  end subroutine gs_device_mpi_nbwait

end module gs_device_mpi
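
Usage note: a minimal sketch of how this backend is typically driven by a gather-scatter operation, with receives posted first, sends packed and posted next, and the wait call unpacking into the field. The surrounding variables (u, n, send_pe, recv_pe, deps, strm) and the GS_OP_ADD constant from gs_ops stand in for the actual calling code in Neko's gather-scatter type; only the gs_device_mpi_t calls themselves come from this file.

  type(gs_device_mpi_t) :: gs_comm
  real(kind=rp) :: u(n)     ! field to gather-scatter
  type(c_ptr) :: deps, strm ! dependency event and stream from the device backend

  call gs_comm%init(send_pe, recv_pe)        ! set up buffers, streams and events
  call gs_comm%nbrecv()                      ! post non-blocking receives
  call gs_comm%nbsend(u, n, deps, strm)      ! pack on the device and post sends
  call gs_comm%nbwait(u, n, GS_OP_ADD, strm) ! wait, unpack and apply the reduction
  call gs_comm%free()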