Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
gs_device_nccl.F90
Go to the documentation of this file.
1! Copyright (c) 2025, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
!> Defines NCCL based gather-scatter communication
module gs_device_nccl
  use num_types, only : rp, c_rp
  use gs_comm, only : gs_comm_t
  use gs_ops
  use stack, only : stack_i4_t
  use comm, only : pe_size, pe_rank
  use htable, only : htable_i4_t
  use device
  use utils, only : neko_error
  use, intrinsic :: iso_c_binding, only : c_sizeof, c_int32_t, &
       c_ptr, c_null_ptr, c_size_t, c_associated, c_int
  implicit none
  private
47
!> Buffers for non-blocking communication and packing/unpacking.
type, private :: gs_device_nccl_buf_t
  !> Number of dofs per PE, one entry per element of the PE order list
  integer, allocatable :: ndofs(:)
  !> Zero-based start (in elements) of each PE's segment in the buffers
  integer, allocatable :: offset(:)
  !> Sum of all entries of `ndofs`
  integer :: total
  !> Device buffer sized for `total` values of kind `c_rp`
  type(c_ptr) :: buf_d = c_null_ptr
  !> Device array sized for `total` 32-bit dof indices
  type(c_ptr) :: dof_d = c_null_ptr
contains
  procedure, pass(this) :: init => gs_device_nccl_buf_init
  procedure, pass(this) :: free => gs_device_nccl_buf_free
end type gs_device_nccl_buf_t
59
!> Gather-scatter communication using NCCL.
!! The per-PE arrays below are indexed in the same way as the
!! `send_pe` / `recv_pe` lists inherited from `gs_comm_t`.
type, public, extends(gs_comm_t) :: gs_device_nccl_t
  !> Pack buffer (send side) and unpack buffer (receive side)
  type(gs_device_nccl_buf_t) :: send_buf, recv_buf
  !> Device streams, allocated with one entry per receive PE in init
  type(c_ptr), dimension(:), allocatable :: stream
  !> Device events, recorded on the matching stream at the end of nbwait
  type(c_ptr), dimension(:), allocatable :: event
  !> Not referenced by the procedures of this module
  integer :: nb_strtgy
  !> Not referenced by the procedures of this module
  type(c_ptr) :: send_event = c_null_ptr
contains
  !> Set up buffers, streams and events
  procedure, pass(this) :: init => gs_device_nccl_init
  !> Release buffers and device resources
  procedure, pass(this) :: free => gs_device_nccl_free
  !> Pack outgoing data on the device
  procedure, pass(this) :: nbsend => gs_device_nccl_nbsend
  !> No-op; receives are posted in nbwait
  procedure, pass(this) :: nbrecv => gs_device_nccl_nbrecv
  !> Exchange data, unpack and synchronise
  procedure, pass(this) :: nbwait => gs_device_nccl_nbwait
end type gs_device_nccl_t
76
! C bindings for the device pack/unpack kernels. All arguments are passed
! by value; device buffers and the stream are opaque C pointers.
! NOTE(review): judging by names and arguments, `pack` presumably gathers
! `n` entries of `u_d` (selected through `dof_d`, starting at `offset`)
! into `buf_d`, and `unpack` applies `op` in the reverse direction --
! confirm against the kernel sources.
#ifdef HAVE_HIP
interface
  subroutine hip_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
       bind(c, name='hip_gs_pack')
    use, intrinsic :: iso_c_binding, only : c_int, c_ptr
    implicit none
    type(c_ptr), value :: u_d, buf_d, dof_d, stream
    integer(c_int), value :: n, offset
  end subroutine hip_gs_pack

  subroutine hip_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
       bind(c, name='hip_gs_unpack')
    use, intrinsic :: iso_c_binding, only : c_int, c_ptr
    implicit none
    type(c_ptr), value :: u_d, buf_d, dof_d, stream
    integer(c_int), value :: op, offset, n
  end subroutine hip_gs_unpack
end interface
#elif HAVE_CUDA
interface
  subroutine cuda_gs_pack(u_d, buf_d, dof_d, offset, n, stream) &
       bind(c, name='cuda_gs_pack')
    use, intrinsic :: iso_c_binding, only : c_int, c_ptr
    implicit none
    type(c_ptr), value :: u_d, buf_d, dof_d, stream
    integer(c_int), value :: n, offset
  end subroutine cuda_gs_pack

  subroutine cuda_gs_unpack(u_d, op, buf_d, dof_d, offset, n, stream) &
       bind(c, name='cuda_gs_unpack')
    use, intrinsic :: iso_c_binding, only : c_int, c_ptr
    implicit none
    type(c_ptr), value :: u_d, buf_d, dof_d, stream
    integer(c_int), value :: op, offset, n
  end subroutine cuda_gs_unpack
end interface
#endif
118
! C binding for the paired send/receive used in nbwait. All arguments are
! passed by value. The call site in this module passes the offsets already
! multiplied by `nbytes`, and the counts as numbers of elements.
interface
  subroutine device_nccl_sendrecv(sbuf_d, soffset, scount, srank, &
       rbuf_d, roffset, rcount, rrank, nbytes, stream) &
       bind(c, name='device_nccl_sendrecv')
    use, intrinsic :: iso_c_binding, only : c_int, c_ptr
    implicit none
    ! Send side: buffer, offset, count, peer rank
    type(c_ptr), value :: sbuf_d
    integer(c_int), value :: soffset, scount, srank
    ! Receive side: buffer, offset, count, peer rank
    type(c_ptr), value :: rbuf_d
    integer(c_int), value :: roffset, rcount, rrank
    ! Size of one element in bytes, and the stream to enqueue on
    integer(c_int), value :: nbytes
    type(c_ptr), value :: stream
  end subroutine device_nccl_sendrecv
end interface
130
131contains
132
!> Initialise a pack/unpack buffer from per-PE dof stacks.
!! @param pe_order   PEs to exchange with; fixes the segment order.
!! @param dof_stack  Stacks of dof indices, indexed by the entries of
!!                   `pe_order`.
!! @param mark_dupes If true, every dof index that occurs more than once
!!                   across all segments is stored negated (all of its
!!                   occurrences, including the first one).
!! On return `ndofs`, `offset` and `total` are set, `buf_d` is allocated
!! for `total` reals and `dof_d` holds the (possibly negated) dof indices.
subroutine gs_device_nccl_buf_init(this, pe_order, dof_stack, mark_dupes)
  class(gs_device_nccl_buf_t), intent(inout) :: this
  integer, allocatable, intent(inout) :: pe_order(:)
  type(stack_i4_t), allocatable, intent(inout) :: dof_stack(:)
  logical, intent(in) :: mark_dupes
  integer, allocatable :: dofs(:)
  integer :: i, j, k, total, dupe
  integer(c_size_t) :: sz
  type(htable_i4_t) :: doftable
  real(c_rp) :: rp_dummy
  integer(c_int32_t) :: i4_dummy

  allocate(this%ndofs(size(pe_order)))
  allocate(this%offset(size(pe_order)))

  ! Segment sizes and zero-based segment starts (exclusive prefix sum)
  total = 0
  do i = 1, size(pe_order)
     this%ndofs(i) = dof_stack(pe_order(i))%size()
     this%offset(i) = total
     total = total + this%ndofs(i)
  end do

  this%total = total

  sz = c_sizeof(rp_dummy) * total
  call device_alloc(this%buf_d, sz)

  sz = c_sizeof(i4_dummy) * total
  call device_alloc(this%dof_d, sz)

  ! The table maps a dof index to the position of its first occurrence
  if (mark_dupes) call doftable%init(2*total)
  allocate(dofs(total))

  ! Copy from dof_stack into dofs, optionally marking duplicates with doftable
  do i = 1, size(pe_order)
     ! %array() breaks on cray
     select type (arr => dof_stack(pe_order(i))%data)
     type is (integer)
        do j = 1, this%ndofs(i)
           k = this%offset(i) + j
           if (mark_dupes) then
              if (doftable%get(arr(j), dupe) .eq. 0) then
                 ! Seen before: negate the first occurrence (only once)
                 ! and store this occurrence negated as well
                 if (dofs(dupe) .gt. 0) then
                    dofs(dupe) = -dofs(dupe)
                 end if
                 dofs(k) = -arr(j)
              else
                 call doftable%set(arr(j), k)
                 dofs(k) = arr(j)
              end if
           else
              dofs(k) = arr(j)
           end if
        end do
     class default
        ! Without this branch the segment would be left uninitialised
        ! and garbage indices would be uploaded to the device
        call neko_error('gs_device_nccl: dof stack does not hold integers')
     end select
  end do

  call device_memcpy(dofs, this%dof_d, total, host_to_device, sync=.true.)

  deallocate(dofs)
  ! The table only exists when duplicates were tracked
  if (mark_dupes) call doftable%free()

end subroutine gs_device_nccl_buf_init
201
!> Release the host bookkeeping arrays and the device buffers of a
!! pack/unpack buffer. Each resource is only released if it is
!! currently allocated / associated.
subroutine gs_device_nccl_buf_free(this)
  class(gs_device_nccl_buf_t), intent(inout) :: this

  ! Host side: per-PE sizes and offsets
  if (allocated(this%ndofs)) then
     deallocate(this%ndofs)
  end if
  if (allocated(this%offset)) then
     deallocate(this%offset)
  end if

  ! Device side: value buffer and dof index array
  if (c_associated(this%buf_d)) then
     call device_free(this%buf_d)
  end if
  if (c_associated(this%dof_d)) then
     call device_free(this%dof_d)
  end if
end subroutine gs_device_nccl_buf_free
211
!> Initialise NCCL based communication method
!! @param send_pe Stack of PEs to send to.
!! @param recv_pe Stack of PEs to receive from.
!! Sets up the PE ordering through `init_order`, builds the send and
!! receive buffers (duplicates are marked on the receive side only) and,
!! when a HIP or CUDA backend is compiled in, creates one stream and one
!! event per receive PE.
subroutine gs_device_nccl_init(this, send_pe, recv_pe)
  class(gs_device_nccl_t), intent(inout) :: this
  type(stack_i4_t), intent(inout) :: send_pe
  type(stack_i4_t), intent(inout) :: recv_pe
  integer :: i

  call this%init_order(send_pe, recv_pe)

  call this%send_buf%init(this%send_pe, this%send_dof, .false.)
  call this%recv_buf%init(this%recv_pe, this%recv_dof, .true.)

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
  ! Create a set of non-blocking streams.
  ! The streams are used by nbsend/nbwait and destroyed in free, so they
  ! must actually be created here, not just allocated.
  ! NOTE(review): flag value 1 is presumably the backend's "non-blocking"
  ! stream flag -- confirm against the device module.
  ! NOTE(review): nbsend and nbwait index `stream` up to size(send_pe);
  ! this presumably relies on send_pe and recv_pe having the same
  ! length -- confirm.
  allocate(this%stream(size(this%recv_pe)))
  do i = 1, size(this%recv_pe)
     call device_stream_create_with_priority(this%stream(i), 1, &
          strm_high_prio)
  end do

  ! One event per stream, recorded at the end of nbwait
  allocate(this%event(size(this%recv_pe)))
  do i = 1, size(this%recv_pe)
     call device_event_create(this%event(i), 2)
  end do
#endif

end subroutine gs_device_nccl_init
238
!> Deallocate NCCL based communication method
!! Releases the send/receive buffers, the PE ordering and dof lists and,
!! when a HIP or CUDA backend is compiled in, every stream and event
!! created in init.
subroutine gs_device_nccl_free(this)
  class(gs_device_nccl_t), intent(inout) :: this
  integer :: i

  call this%send_buf%free()
  call this%recv_buf%free()

  call this%free_order()
  call this%free_dofs()

#if defined(HAVE_HIP) || defined(HAVE_CUDA)
  if (allocated(this%stream)) then
     do i = 1, size(this%stream)
        call device_stream_destroy(this%stream(i))
     end do
     deallocate(this%stream)
  end if

  ! Events are created alongside the streams in init; release them too
  ! so that repeated init/free cycles do not leak device events
  if (allocated(this%event)) then
     do i = 1, size(this%event)
        call device_event_destroy(this%event(i))
     end do
     deallocate(this%event)
  end if
#endif

end subroutine gs_device_nccl_free
260
!> Post non-blocking send operations
!! Only the packing is done here: for every send PE the matching stream
!! first waits on `deps`, then the pack kernel for that PE's segment is
!! enqueued on it. The actual communication happens in nbwait.
!! @param u    Field to pack; its device pointer is looked up here.
!! @param n    Length of `u`.
!! @param deps Event each stream waits on before packing.
!! @param strm Not used by this routine.
subroutine gs_device_nccl_nbsend(this, u, n, deps, strm)
  class(gs_device_nccl_t), intent(inout) :: this
  integer, intent(in) :: n
  real(kind=rp), dimension(n), intent(inout) :: u
  type(c_ptr), intent(inout) :: deps
  type(c_ptr), intent(inout) :: strm
  type(c_ptr) :: u_d
  integer :: ipe

  u_d = device_get_ptr(u)

  do ipe = 1, size(this%send_pe)
     ! Packing must not start before the caller's dependency has completed
     call device_stream_wait_event(this%stream(ipe), deps, 0)
#ifdef HAVE_HIP
     call hip_gs_pack(u_d, this%send_buf%buf_d, this%send_buf%dof_d, &
          this%send_buf%offset(ipe), this%send_buf%ndofs(ipe), &
          this%stream(ipe))
#elif HAVE_CUDA
     call cuda_gs_pack(u_d, this%send_buf%buf_d, this%send_buf%dof_d, &
          this%send_buf%offset(ipe), this%send_buf%ndofs(ipe), &
          this%stream(ipe))
#else
     call neko_error('gs_device_nccl: no backend')
#endif
  end do

  ! Everything else is done in the wait routine

end subroutine gs_device_nccl_nbsend
298
!> Post non-blocking receive operations
!! Intentionally empty: receives are issued together with the sends in
!! nbwait.
subroutine gs_device_nccl_nbrecv(this)
  class(gs_device_nccl_t), intent(inout) :: this

  ! Everything is done in the wait routine

end subroutine gs_device_nccl_nbrecv
307
!> Wait for non-blocking operations
!! For every PE pair the send/receive is enqueued on that PE's stream,
!! followed by the unpack kernel and an event record. Finally `strm` is
!! made to wait on all recorded events.
!! @param u    Field to unpack into; its device pointer is looked up here.
!! @param n    Length of `u`.
!! @param op   Operation code forwarded unchanged to the unpack kernel.
!! @param strm Stream that is synchronised with the per-PE streams.
subroutine gs_device_nccl_nbwait(this, u, n, op, strm)
  class(gs_device_nccl_t), intent(inout) :: this
  integer, intent(in) :: n
  real(kind=rp), dimension(n), intent(inout) :: u
  type(c_ptr), intent(inout) :: strm
  ! NOTE(review): no intent is declared on `op`, as in the original;
  ! presumably it has to match the inherited interface in gs_comm_t --
  ! confirm before adding one.
  integer :: op
  integer :: done_req, i
  type(c_ptr) :: u_d
  real(c_rp) :: rp_dummy
  integer(c_int) :: nbytes

  u_d = device_get_ptr(u)
  ! Size of one buffer element in bytes
  nbytes = int(c_sizeof(rp_dummy), c_int)

  ! NOTE(review): the receive-side arrays are indexed with the send-side
  ! loop counter, which presumably relies on send_pe and recv_pe having
  ! the same length -- confirm.
  do i = 1, size(this%send_pe)

     ! Offsets are passed in bytes here, counts in elements.
     ! NOTE(review): nbytes*offset is evaluated in default integer
     ! arithmetic and could overflow for very large buffers.
     call device_nccl_sendrecv(this%send_buf%buf_d, &
          nbytes*this%send_buf%offset(i), &
          this%send_buf%ndofs(i), &
          this%send_pe(i), &
          this%recv_buf%buf_d, &
          nbytes*this%recv_buf%offset(i), &
          this%recv_buf%ndofs(i), &
          this%recv_pe(i), &
          nbytes, &
          this%stream(i))

     ! The unpack kernel takes the offset in elements, not bytes
#ifdef HAVE_HIP
     call hip_gs_unpack(u_d, op, &
          this%recv_buf%buf_d, &
          this%recv_buf%dof_d, &
          this%recv_buf%offset(i), &
          this%recv_buf%ndofs(i), &
          this%stream(i))
#elif HAVE_CUDA
     call cuda_gs_unpack(u_d, op, &
          this%recv_buf%buf_d, &
          this%recv_buf%dof_d, &
          this%recv_buf%offset(i), &
          this%recv_buf%ndofs(i), &
          this%stream(i))
#else
     call neko_error('gs_device_nccl: no backend')
#endif
     call device_event_record(this%event(i), this%stream(i))
  end do

  ! Sync non-blocking streams
  do done_req = 1, size(this%recv_pe)
     call device_stream_wait_event(strm, &
          this%event(done_req), 0)
  end do

end subroutine gs_device_nccl_nbwait
362
363end module gs_device_nccl
void cuda_gs_unpack(real *u_d, int op, real *buf_d, int *dof_d, int offset, int n, cudaStream_t stream)
Definition gs.cu:132
void cuda_gs_pack(void *u_d, void *buf_d, void *dof_d, int offset, int n, cudaStream_t stream)
Definition gs.cu:116
Return the device pointer for an associated Fortran array.
Definition device.F90:96
Copy data between host and device (or device and device)
Definition device.F90:66
Definition comm.F90:1
integer, public pe_size
MPI size of communicator.
Definition comm.F90:58
integer, public pe_rank
MPI rank.
Definition comm.F90:55
Device abstraction, common interface for various accelerators.
Definition device.F90:34
subroutine, public device_event_record(event, stream)
Record a device event.
Definition device.F90:1290
integer, parameter, public host_to_device
Definition device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:214
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
Definition device.F90:187
subroutine, public device_stream_create_with_priority(stream, flags, prio)
Create a device stream/command queue with priority.
Definition device.F90:1168
subroutine, public device_stream_wait_event(stream, event, flags)
Synchronize a device stream with an event.
Definition device.F90:1203
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition device.F90:1244
integer, public strm_high_prio
High priority stream setting.
Definition device.F90:60
subroutine, public device_stream_destroy(stream)
Destroy a device stream/command queue.
Definition device.F90:1185
Defines a gather-scatter communication method.
Definition gs_comm.f90:34
Defines NCCL based gather-scatter communication.
subroutine gs_device_nccl_nbsend(this, u, n, deps, strm)
Post non-blocking send operations.
subroutine gs_device_nccl_init(this, send_pe, recv_pe)
Initialise NCCL based communication method.
subroutine gs_device_nccl_nbwait(this, u, n, op, strm)
Wait for non-blocking operations.
subroutine gs_device_nccl_buf_init(this, pe_order, dof_stack, mark_dupes)
subroutine gs_device_nccl_nbrecv(this)
Post non-blocking receive operations.
subroutine gs_device_nccl_free(this)
Deallocate NCCL based communication method.
subroutine gs_device_nccl_buf_free(this)
Defines Gather-scatter operations.
Definition gs_ops.f90:34
Implements a hash table ADT.
Definition htable.f90:36
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Implements a dynamic stack ADT.
Definition stack.f90:35
Utilities.
Definition utils.f90:35
Gather-scatter communication method.
Definition gs_comm.f90:46
Buffers for non-blocking communication and packing/unpacking.
Gather-scatter communication using NCCL. The arrays are indexed per PE like send_pe and recv_pe.
Integer based hash table.
Definition htable.f90:82
Integer based stack.
Definition stack.f90:63