Neko 1.99.2
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
gs_device_nccl.F90
Go to the documentation of this file.
1! Copyright (c) 2025, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
35 use num_types, only : rp, c_rp
36 use gs_comm, only : gs_comm_t
37 use stack, only : stack_i4_t
38 use comm, only : pe_size, pe_rank
39 use htable, only : htable_i4_t
40 use device
41 use utils, only : neko_error
42 use, intrinsic :: iso_c_binding, only : c_sizeof, c_int32_t, &
43 c_ptr, c_null_ptr, c_size_t, c_associated, c_int
44 implicit none
45 private
46
!> Buffers for non-blocking communication and packing/unpacking.
type, private :: gs_device_nccl_buf_t
  !> Number of dofs exchanged with each PE (same ordering as the PE list
  !! handed to `init`).
  integer, allocatable :: ndofs(:)
  !> Zero-based start of each PE's segment inside the packed buffer.
  integer, allocatable :: offset(:)
  !> Total number of dofs summed over all PEs.
  integer :: total
  !> Device buffer sized for `total` values of kind `c_rp`.
  type(c_ptr) :: buf_d = c_null_ptr
  !> Device array sized for `total` 32-bit dof indices.
  type(c_ptr) :: dof_d = c_null_ptr
contains
  procedure, pass(this) :: init => gs_device_nccl_buf_init
  procedure, pass(this) :: free => gs_device_nccl_buf_free
end type gs_device_nccl_buf_t
58
!> Gather-scatter communication using NCCL.
!! The per-PE arrays below are indexed in the same order as the PE lists.
type, public, extends(gs_comm_t) :: gs_device_nccl_t
  !> Packed outgoing data and its dof mapping.
  type(gs_device_nccl_buf_t) :: send_buf
  !> Packed incoming data and its dof mapping.
  type(gs_device_nccl_buf_t) :: recv_buf
  !> One device stream per PE, created in `init`.
  type(c_ptr), allocatable :: stream(:)
  !> One device event per PE, recorded after unpacking in `nbwait`.
  type(c_ptr), allocatable :: event(:)
  !> Not referenced inside this module.
  !! NOTE(review): presumably a non-blocking strategy selector - confirm.
  integer :: nb_strtgy
  !> Not referenced inside this module.
  type(c_ptr) :: send_event = c_null_ptr
contains
  ! The passed-object dummy is the first argument (`this`) by default.
  procedure :: init => gs_device_nccl_init
  procedure :: free => gs_device_nccl_free
  procedure :: nbsend => gs_device_nccl_nbsend
  procedure :: nbrecv => gs_device_nccl_nbrecv
  procedure :: nbwait => gs_device_nccl_nbwait
end type gs_device_nccl_t
75
#ifdef HAVE_HIP
! Explicit interfaces to the C-bound HIP pack/unpack routines.
interface
  !> Binding to the C routine `hip_gs_pack`.
  !! In this module it is called with the device pointer of the field,
  !! a packed buffer, a dof index array, a segment offset, a segment
  !! length and a stream.
  subroutine hip_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
       bind(c, name = 'hip_gs_pack')
    use, intrinsic :: iso_c_binding, only : c_int, c_ptr
    implicit none
    type(c_ptr), value :: u_d, buf_d, dof_d, stream
    integer(c_int), value :: offset, n
  end subroutine hip_gs_pack

  !> Binding to the C routine `hip_gs_unpack`.
  !! Takes the same arguments as the pack routine plus an operation
  !! code `op`.
  subroutine hip_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
       bind(c, name = 'hip_gs_unpack')
    use, intrinsic :: iso_c_binding, only : c_int, c_ptr
    implicit none
    type(c_ptr), value :: u_d, buf_d, dof_d, stream
    integer(c_int), value :: op, offset, n
  end subroutine hip_gs_unpack
end interface
#elif HAVE_CUDA
! Explicit interfaces to the C-bound CUDA pack/unpack routines.
interface
  !> Binding to the C routine `cuda_gs_pack`.
  !! In this module it is called with the device pointer of the field,
  !! a packed buffer, a dof index array, a segment offset, a segment
  !! length and a stream.
  subroutine cuda_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
       bind(c, name = 'cuda_gs_pack')
    use, intrinsic :: iso_c_binding, only : c_int, c_ptr
    implicit none
    type(c_ptr), value :: u_d, buf_d, dof_d, stream
    integer(c_int), value :: offset, n
  end subroutine cuda_gs_pack

  !> Binding to the C routine `cuda_gs_unpack`.
  !! Takes the same arguments as the pack routine plus an operation
  !! code `op`.
  subroutine cuda_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
       bind(c, name = 'cuda_gs_unpack')
    use, intrinsic :: iso_c_binding, only : c_int, c_ptr
    implicit none
    type(c_ptr), value :: u_d, buf_d, dof_d, stream
    integer(c_int), value :: op, offset, n
  end subroutine cuda_gs_unpack
end interface
#endif
117
interface
  !> Binding to the C routine `device_nccl_sendrecv`.
  !! All scalars are passed by value. In this module the offsets are
  !! given in bytes (element offset times `nbytes`), the counts in
  !! elements, and `nbytes` is the size of one `c_rp` value.
  subroutine device_nccl_sendrecv(sbuf_d, soffset, scount, srank, &
       rbuf_d, roffset, rcount, rrank, nbytes, stream) &
       bind(c, name = 'device_nccl_sendrecv')
    use, intrinsic :: iso_c_binding, only : c_int, c_ptr
    implicit none
    type(c_ptr), value :: sbuf_d, rbuf_d, stream
    integer(c_int), value :: soffset, scount, srank
    integer(c_int), value :: roffset, rcount, rrank
    integer(c_int), value :: nbytes
  end subroutine device_nccl_sendrecv
end interface
129
130contains
131
!> Set up per-PE segment sizes/offsets and allocate the device buffers.
!! @param pe_order   PEs in communication order; entry `i` selects
!!                   `dof_stack(pe_order(i))`.
!! @param dof_stack  Per-PE stacks of dof indices.
!! @param mark_dupes If true, every dof that occurs more than once over
!!                   all PEs is stored with a negated index.
subroutine gs_device_nccl_buf_init(this, pe_order, dof_stack, mark_dupes)
  class(gs_device_nccl_buf_t), intent(inout) :: this
  integer, allocatable, intent(inout) :: pe_order(:)
  type(stack_i4_t), allocatable, intent(inout) :: dof_stack(:)
  logical, intent(in) :: mark_dupes
  integer, allocatable :: host_dofs(:)
  type(htable_i4_t) :: seen
  integer(c_size_t) :: nbytes
  real(c_rp) :: rp_dummy
  integer(c_int32_t) :: i4_dummy
  integer :: npe, ntotal, ipe, idof, pos, first_pos, marked

  npe = size(pe_order)
  allocate(this%ndofs(npe))
  allocate(this%offset(npe))

  ! Running sum: each segment starts where the previous one ended
  ntotal = 0
  do ipe = 1, npe
     this%offset(ipe) = ntotal
     this%ndofs(ipe) = dof_stack(pe_order(ipe))%size()
     ntotal = ntotal + this%ndofs(ipe)
  end do
  this%total = ntotal

  ! Value buffer (one c_rp per dof) and index buffer (one int32 per dof)
  nbytes = c_sizeof(rp_dummy) * ntotal
  call device_alloc(this%buf_d, nbytes)

  nbytes = c_sizeof(i4_dummy) * ntotal
  call device_alloc(this%dof_d, nbytes)

  if (mark_dupes) call seen%init(2*ntotal)
  allocate(host_dofs(ntotal))

  ! Flatten the stacks into host_dofs. With mark_dupes, `seen` maps a dof
  ! to the position of its first occurrence so that both the first and
  ! every later occurrence end up negated.
  marked = 0
  do ipe = 1, npe
     ! Access %data directly, %array() breaks on cray
     select type (arr => dof_stack(pe_order(ipe))%data)
     type is (integer)
        do idof = 1, this%ndofs(ipe)
           pos = this%offset(ipe) + idof
           if (.not. mark_dupes) then
              host_dofs(pos) = arr(idof)
           else if (seen%get(arr(idof), first_pos) .ne. 0) then
              ! First time this dof is seen: remember where it went
              call seen%set(arr(idof), pos)
              host_dofs(pos) = arr(idof)
           else
              ! Repeated dof: flip the first occurrence once, and
              ! store this occurrence negated as well
              if (host_dofs(first_pos) .gt. 0) then
                 host_dofs(first_pos) = -host_dofs(first_pos)
                 marked = marked + 1
              end if
              host_dofs(pos) = -arr(idof)
              marked = marked + 1
           end if
        end do
     end select
  end do

  call device_memcpy(host_dofs, this%dof_d, ntotal, host_to_device, &
       sync = .true.)

  deallocate(host_dofs)
  call seen%free()

end subroutine gs_device_nccl_buf_init
200
!> Release the host bookkeeping arrays and both device buffers.
!! Every resource is checked first, so calling this on a buffer that was
!! never initialised does nothing.
subroutine gs_device_nccl_buf_free(this)
  class(gs_device_nccl_buf_t), intent(inout) :: this

  ! Device side
  if (c_associated(this%buf_d)) then
     call device_free(this%buf_d)
  end if
  if (c_associated(this%dof_d)) then
     call device_free(this%dof_d)
  end if

  ! Host side
  if (allocated(this%ndofs)) then
     deallocate(this%ndofs)
  end if
  if (allocated(this%offset)) then
     deallocate(this%offset)
  end if

end subroutine gs_device_nccl_buf_free
210
!> Initialise NCCL based communication method.
!! @param send_pe Stack of PEs to send to.
!! @param recv_pe Stack of PEs to receive from.
subroutine gs_device_nccl_init(this, send_pe, recv_pe)
  class(gs_device_nccl_t), intent(inout) :: this
  type(stack_i4_t), intent(inout) :: send_pe
  type(stack_i4_t), intent(inout) :: recv_pe
  integer :: i

  call this%init_order(send_pe, recv_pe)

  ! The send and wait routines loop over send_pe while indexing the
  ! recv_pe-sized stream/event arrays and the receive buffer, so the
  ! two lists must have the same length.
  if (size(this%send_pe) .ne. size(this%recv_pe)) then
     call neko_error('gs_device_nccl: send/recv PE lists differ in size')
  end if

  ! Only the receive side marks duplicated dofs
  call this%send_buf%init(this%send_pe, this%send_dof, .false.)
  call this%recv_buf%init(this%recv_pe, this%recv_dof, .true.)

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
  ! Create a set of non-blocking streams, one per PE.
  ! NOTE(review): flag value 1 is presumably the backend's
  ! non-blocking stream flag - confirm against the device module.
  allocate(this%stream(size(this%recv_pe)))
  do i = 1, size(this%recv_pe)
     call device_stream_create_with_priority(this%stream(i), 1, &
          strm_high_prio)
  end do

  ! One event per PE, recorded once that PE's data has been unpacked.
  ! NOTE(review): flag value 2 is backend specific - confirm its meaning.
  allocate(this%event(size(this%recv_pe)))
  do i = 1, size(this%recv_pe)
     call device_event_create(this%event(i), 2)
  end do
#endif

end subroutine gs_device_nccl_init
238
!> Deallocate NCCL based communication method.
!! Releases the send/receive buffers, the PE ordering and dof data, and
!! the per-PE device streams and events created in `init`.
subroutine gs_device_nccl_free(this)
  class(gs_device_nccl_t), intent(inout) :: this
  integer :: i

  call this%send_buf%free()
  call this%recv_buf%free()

  call this%free_order()
  call this%free_dofs()

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
  if (allocated(this%stream)) then
     do i = 1, size(this%stream)
        call device_stream_destroy(this%stream(i))
     end do
     deallocate(this%stream)
  end if

  ! Events are created next to the streams in init. Destroy them and
  ! release the array, otherwise they leak and a later init would fail
  ! when allocating an already allocated array.
  if (allocated(this%event)) then
     do i = 1, size(this%event)
        call device_event_destroy(this%event(i))
     end do
     deallocate(this%event)
  end if
#endif

end subroutine gs_device_nccl_free
260
!> Post non-blocking send operations.
!! Only packs the outgoing data: for every PE the matching stream first
!! waits on `deps`, then the pack routine is queued on that stream. The
!! actual exchange is issued from the wait routine.
!! @param u    Field to pack from; its device pointer is looked up.
!! @param n    Size of `u`.
!! @param deps Event each per-PE stream waits on before packing.
!! @param strm Not used by this routine.
subroutine gs_device_nccl_nbsend(this, u, n, deps, strm)
  class(gs_device_nccl_t), intent(inout) :: this
  integer, intent(in) :: n
  real(kind=rp), dimension(n), intent(inout) :: u
  type(c_ptr), intent(inout) :: deps
  type(c_ptr), intent(inout) :: strm
  type(c_ptr) :: u_d
  integer :: ipe

  u_d = device_get_ptr(u)

  associate (sbuf => this%send_buf)
    do ipe = 1, size(this%send_pe)
       call device_stream_wait_event(this%stream(ipe), deps, 0)
#ifdef HAVE_HIP
       call hip_gs_pack(u_d, sbuf%buf_d, sbuf%dof_d, &
            sbuf%offset(ipe), sbuf%ndofs(ipe), this%stream(ipe))
#elif HAVE_CUDA
       call cuda_gs_pack(u_d, sbuf%buf_d, sbuf%dof_d, &
            sbuf%offset(ipe), sbuf%ndofs(ipe), this%stream(ipe))
#else
       call neko_error('gs_device_nccl: no backend')
#endif
    end do
  end associate

  ! Everything else is done in the wait routine

end subroutine gs_device_nccl_nbsend
298
!> Post non-blocking receive operations.
!! Intentionally empty: this routine performs no work, everything is
!! done in the wait routine.
subroutine gs_device_nccl_nbrecv(this)
  class(gs_device_nccl_t), intent(inout) :: this

  ! Everything is done in the wait routine

end subroutine gs_device_nccl_nbrecv
307
!> Wait for non-blocking operations.
!! For every PE this queues the exchange and the unpack of the received
!! data on that PE's stream and records the PE's event; afterwards
!! `strm` is made to wait on all of those events.
!! @param u    Field to unpack into; its device pointer is looked up.
!! @param n    Size of `u`.
!! @param op   Operation code forwarded unchanged to the unpack routine.
!! @param strm Stream that is synchronised with the per-PE streams.
subroutine gs_device_nccl_nbwait(this, u, n, op, strm)
  class(gs_device_nccl_t), intent(inout) :: this
  integer, intent(in) :: n
  real(kind=rp), dimension(n), intent(inout) :: u
  type(c_ptr), intent(inout) :: strm
  integer :: op
  integer :: done_req, i
  type(c_ptr) :: u_d
  real(c_rp) :: rp_dummy
  integer(c_int) :: nbytes

  u_d = device_get_ptr(u)
  ! c_sizeof returns c_size_t; convert explicitly for the C interface
  nbytes = int(c_sizeof(rp_dummy), c_int)

  do i = 1, size(this%send_pe)

     ! Offsets are passed in bytes, counts in elements
     call device_nccl_sendrecv(this%send_buf%buf_d, &
          nbytes*this%send_buf%offset(i), &
          this%send_buf%ndofs(i), &
          this%send_pe(i), &
          this%recv_buf%buf_d, &
          nbytes*this%recv_buf%offset(i), &
          this%recv_buf%ndofs(i), &
          this%recv_pe(i), &
          nbytes, &
          this%stream(i))

#ifdef HAVE_HIP
     call hip_gs_unpack(u_d, op, &
          this%recv_buf%buf_d, &
          this%recv_buf%dof_d, &
          this%recv_buf%offset(i), &
          this%recv_buf%ndofs(i), &
          this%stream(i))
#elif HAVE_CUDA
     call cuda_gs_unpack(u_d, op, &
          this%recv_buf%buf_d, &
          this%recv_buf%dof_d, &
          this%recv_buf%offset(i), &
          this%recv_buf%ndofs(i), &
          this%stream(i))
#else
     call neko_error('gs_device_nccl: no backend')
#endif
     ! Marks the point where this PE's data has been unpacked
     call device_event_record(this%event(i), this%stream(i))
  end do

  ! Sync non-blocking streams
  do done_req = 1, size(this%recv_pe)
     call device_stream_wait_event(strm, &
          this%event(done_req), 0)
  end do

end subroutine gs_device_nccl_nbwait
362
363end module gs_device_nccl
void cuda_gs_unpack(real *u_d, int op, real *buf_d, int *dof_d, int offset, int n, cudaStream_t stream)
Definition gs.cu:132
void cuda_gs_pack(void *u_d, void *buf_d, void *dof_d, int offset, int n, cudaStream_t stream)
Definition gs.cu:116
Return the device pointer for an associated Fortran array.
Definition device.F90:101
Copy data between host and device (or device and device)
Definition device.F90:71
Definition comm.F90:1
integer, public pe_size
MPI size of communicator.
Definition comm.F90:59
integer, public pe_rank
MPI rank.
Definition comm.F90:56
Device abstraction, common interface for various accelerators.
Definition device.F90:34
subroutine, public device_event_record(event, stream)
Record a device event.
Definition device.F90:1295
integer, parameter, public host_to_device
Definition device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:219
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
Definition device.F90:192
subroutine, public device_stream_create_with_priority(stream, flags, prio)
Create a device stream/command queue with priority.
Definition device.F90:1173
subroutine, public device_stream_wait_event(stream, event, flags)
Synchronize a device stream with an event.
Definition device.F90:1208
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition device.F90:1249
integer, public strm_high_prio
High priority stream setting.
Definition device.F90:65
subroutine, public device_stream_destroy(stream)
Destroy a device stream/command queue.
Definition device.F90:1190
Defines a gather-scatter communication method.
Definition gs_comm.f90:34
Defines NCCL based gather-scatter communication.
subroutine gs_device_nccl_nbsend(this, u, n, deps, strm)
Post non-blocking send operations.
subroutine gs_device_nccl_init(this, send_pe, recv_pe)
Initialise NCCL based communication method.
subroutine gs_device_nccl_nbwait(this, u, n, op, strm)
Wait for non-blocking operations.
subroutine gs_device_nccl_buf_init(this, pe_order, dof_stack, mark_dupes)
subroutine gs_device_nccl_nbrecv(this)
Post non-blocking receive operations.
subroutine gs_device_nccl_free(this)
Deallocate NCCL based communication method.
subroutine gs_device_nccl_buf_free(this)
Implements a hash table ADT.
Definition htable.f90:36
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Implements a dynamic stack ADT.
Definition stack.f90:35
Utilities.
Definition utils.f90:35
Gather-scatter communication method.
Definition gs_comm.f90:46
Buffers for non-blocking communication and packing/unpacking.
Gather-scatter communication using NCCL. The arrays are indexed per PE like send_pe and recv_pe.
Integer based hash table.
Definition htable.f90:82
Integer based stack.
Definition stack.f90:63