Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
gs_shmem.F90
Go to the documentation of this file.
1! Copyright (c) 2026, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
35 use num_types, only : rp, c_rp, i8
36 use gs_comm, only : gs_comm_t
38 use stack, only : stack_i4_t
39 use comm, only : neko_comm, pe_rank, pe_size
40 use mpi_f08, only : mpi_allreduce, mpi_alltoall, mpi_integer, mpi_max
41 use utils, only : neko_error
42#ifdef HAVE_OPENSHMEM
47#endif
48 use, intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr, c_loc, &
49 c_f_pointer, c_associated, c_sizeof, c_size_t, c_int64_t
50 implicit none
51 private
52
56 integer, allocatable :: ndofs(:)
58 integer, allocatable :: offset(:)
61 integer, allocatable :: remote_offset(:)
63 integer :: total = 0
65 integer :: max_total = 0
67 type(c_ptr) :: buf_ptr = c_null_ptr
68 contains
69 procedure, pass(this) :: init => gs_shmem_buf_init
70 procedure, pass(this) :: free => gs_shmem_buf_free
71 end type gs_shmem_buf_t
72
91 type, public, extends(gs_comm_t) :: gs_shmem_t
92 type(gs_shmem_buf_t) :: send_buf
93 type(gs_shmem_buf_t) :: recv_buf
94 ! Symmetric data-arrival signals; data_signals[r] is set to `iter`
95 ! by PE r via shmem_putmem_signal_nbi when it has put data into
96 ! our recv buffer. Sized to pe_size on every PE.
97 type(c_ptr) :: data_signals_ptr = c_null_ptr
98 ! Symmetric ack signals; ack_signals[r] is set to `iter` by PE r
99 ! via shmem_uint64_atomic_set after it has consumed our most
100 ! recent put. Sized to pe_size on every PE.
101 type(c_ptr) :: ack_signals_ptr = c_null_ptr
102 ! Monotonically increasing iteration counter; sender writes this
103 ! value into the receiver's data signal slot via SHMEM_SIGNAL_SET,
104 ! and the receiver writes the same value into the sender's ack
105 ! slot after consumption.
106 integer(kind=i8) :: iter = 0
107 contains
108 procedure, pass(this) :: init => gs_shmem_init
109 procedure, pass(this) :: free => gs_shmem_free
110 procedure, pass(this) :: nbsend => gs_shmem_nbsend
111 procedure, pass(this) :: nbrecv => gs_shmem_nbrecv
112 procedure, pass(this) :: nbwait => gs_shmem_nbwait
113 end type gs_shmem_t
114
115contains
116
121 subroutine gs_shmem_buf_init(this, pe_order, dof_stack)
122 class(gs_shmem_buf_t), intent(inout) :: this
123 integer, intent(in) :: pe_order(:)
124 type(stack_i4_t), intent(inout) :: dof_stack(0:)
125 integer :: i, ierr, n
126 integer(c_size_t) :: sz
127 real(c_rp) :: rp_dummy
128#ifndef HAVE_OPENSHMEM
129 call neko_error('Neko was not built with OpenSHMEM support')
130#else
131
132 n = size(pe_order)
133
134 allocate(this%ndofs(n))
135 allocate(this%offset(n))
136 allocate(this%remote_offset(n))
137
138 do i = 1, n
139 this%remote_offset(i) = -1
140 end do
141
142 this%total = 0
143 do i = 1, n
144 this%ndofs(i) = dof_stack(pe_order(i))%size()
145 this%offset(i) = this%total
146 this%total = this%total + this%ndofs(i)
147 end do
148
149 ! OpenSHMEM symmetric memory must be allocated with the same size on
150 ! every PE.
151 call mpi_allreduce(this%total, this%max_total, 1, mpi_integer, mpi_max, &
152 neko_comm, ierr)
153
154 sz = c_sizeof(rp_dummy) * int(max(this%max_total, 1), c_size_t)
155 this%buf_ptr = shmem_malloc(sz)
156 if (.not. c_associated(this%buf_ptr)) then
157 call neko_error('shmem_malloc failed for gs_shmem buffer')
158 end if
159#endif
160 end subroutine gs_shmem_buf_init
161
163 subroutine gs_shmem_buf_free(this)
164 class(gs_shmem_buf_t), intent(inout) :: this
165
166 if (allocated(this%ndofs)) deallocate(this%ndofs)
167 if (allocated(this%offset)) deallocate(this%offset)
168 if (allocated(this%remote_offset)) deallocate(this%remote_offset)
169
170#ifdef HAVE_OPENSHMEM
171 if (c_associated(this%buf_ptr)) then
172 ! shmem_free is collective; gs_shmem_free issues a barrier before
173 ! tearing down buffers.
174 call shmem_free(this%buf_ptr)
175 end if
176#endif
177 this%buf_ptr = c_null_ptr
178 this%total = 0
179 this%max_total = 0
180
181 end subroutine gs_shmem_buf_free
182
184 subroutine gs_shmem_init(this, send_pe, recv_pe)
185 class(gs_shmem_t), intent(inout) :: this
186 type(stack_i4_t), intent(inout) :: send_pe
187 type(stack_i4_t), intent(inout) :: recv_pe
188 integer :: i, ierr
189 integer(c_size_t) :: i64_size
190 integer(c_int64_t) :: i64_dummy
191 integer, allocatable :: local_offsets(:), remote_offsets(:)
192#ifndef HAVE_OPENSHMEM
193 call neko_error('Neko was not built with OpenSHMEM support')
194#else
195
196 call this%free()
197 call this%init_order(send_pe, recv_pe)
198
199 call this%send_buf%init(this%send_pe, this%send_dof)
200 call this%recv_buf%init(this%recv_pe, this%recv_dof)
201
202 ! Allocate the per-rank symmetric signal arrays. Size pe_size on
203 ! every PE -> no Allreduce needed and no slot handshake required;
204 ! every PE writes/reads at offset = remote PE's rank.
205 i64_size = c_sizeof(i64_dummy)
206 this%data_signals_ptr = shmem_calloc(int(pe_size, c_size_t), i64_size)
207 if (.not. c_associated(this%data_signals_ptr)) then
208 call neko_error('shmem_calloc failed for gs_shmem data signals')
209 end if
210 this%ack_signals_ptr = shmem_calloc(int(pe_size, c_size_t), i64_size)
211 if (.not. c_associated(this%ack_signals_ptr)) then
212 call neko_error('shmem_calloc failed for gs_shmem ack signals')
213 end if
214
215 ! Offset exchange via Alltoall. send_pe and recv_pe are built in
216 ! gs_schedule with a shifted-modulo neighbor pattern; pairwise
217 ! Isend/Irecv keyed by neighbor rank turned out to deadlock at
218 ! certain PE counts. Alltoall is a single collective call that
219 ! cannot mismatch, and the payload is just pe_size ints per PE.
220 allocate(local_offsets(0:pe_size - 1))
221 allocate(remote_offsets(0:pe_size - 1))
222 local_offsets = -1
223 do i = 1, size(this%recv_pe)
224 local_offsets(this%recv_pe(i)) = this%recv_buf%offset(i)
225 end do
226 call mpi_alltoall(local_offsets, 1, mpi_integer, &
227 remote_offsets, 1, mpi_integer, neko_comm, ierr)
228 do i = 1, size(this%send_pe)
229 this%send_buf%remote_offset(i) = remote_offsets(this%send_pe(i))
230 end do
231 deallocate(local_offsets)
232 deallocate(remote_offsets)
233
234 this%iter = 0
235
236 ! Ensure all PEs have completed symmetric allocation before any
237 ! one-sided communication is issued.
238 call shmem_barrier_all()
239#endif
240 end subroutine gs_shmem_init
241
243 subroutine gs_shmem_free(this)
244 class(gs_shmem_t), intent(inout) :: this
245
246#ifdef HAVE_OPENSHMEM
247 ! shmem_free is collective; synchronize first to ensure no in-flight
248 ! puts target the buffers we are about to release.
249 call shmem_barrier_all()
250
251 if (c_associated(this%data_signals_ptr)) then
252 call shmem_free(this%data_signals_ptr)
253 end if
254 if (c_associated(this%ack_signals_ptr)) then
255 call shmem_free(this%ack_signals_ptr)
256 end if
257 this%data_signals_ptr = c_null_ptr
258 this%ack_signals_ptr = c_null_ptr
259 this%iter = 0
260#endif
261
262 call this%send_buf%free()
263 call this%recv_buf%free()
264
265 call this%free_order()
266 call this%free_dofs()
267
268 end subroutine gs_shmem_free
269
275 subroutine gs_shmem_nbsend(this, u, n, deps, strm)
276 class(gs_shmem_t), intent(inout) :: this
277 integer, intent(in) :: n
278 real(kind=rp), dimension(n), intent(inout) :: u
279 type(c_ptr), intent(inout) :: deps
280 type(c_ptr), intent(inout) :: strm
281 integer :: i, j, dst, base
282 integer(c_size_t) :: nbytes
283 integer , pointer :: sp(:)
284 real(kind=rp), pointer :: send_data(:), recv_data(:)
285 integer(c_int64_t), pointer :: data_signals(:), ack_signals(:)
286 real(c_rp) :: rp_dummy
287#ifdef HAVE_OPENSHMEM
288
289 ! Each gs op gets a fresh signal value so receivers can distinguish
290 ! puts from one call vs. the next.
291 this%iter = this%iter + 1
292
293 call c_f_pointer(this%send_buf%buf_ptr, send_data, &
294 [max(this%send_buf%max_total, 1)])
295 call c_f_pointer(this%recv_buf%buf_ptr, recv_data, &
296 [max(this%recv_buf%max_total, 1)])
297 call c_f_pointer(this%data_signals_ptr, data_signals, [pe_size])
298 call c_f_pointer(this%ack_signals_ptr, ack_signals, [pe_size])
299
300 do i = 1, size(this%send_pe)
301 dst = this%send_pe(i)
302
303 ! Wait for dst to have consumed our previous round's data before
304 ! we overwrite their recv buffer. ack_signals[dst] is calloc'd
305 ! to 0 and dst writes the iter value (via atomic_set) after
306 ! consumption, so for iter == 1 the wait succeeds immediately.
307 call shmem_uint64_wait_until(c_loc(ack_signals(dst + 1)), &
308 shmem_cmp_ge, this%iter - 1_8)
309
310 sp => this%send_dof(dst)%array()
311 base = this%send_buf%offset(i)
312 do concurrent(j = 1:this%send_dof(dst)%size())
313 send_data(base + j) = u(sp(j))
314 end do
315
316 nbytes = int(this%send_buf%ndofs(i), c_size_t) * c_sizeof(rp_dummy)
317 ! Put data + signal dst's data_signals[my_rank].
319 c_loc(recv_data(this%send_buf%remote_offset(i) + 1)), &
320 c_loc(send_data(base + 1)), &
321 nbytes, &
322 c_loc(data_signals(pe_rank + 1)), &
323 this%iter, shmem_signal_set, dst)
324 end do
325#endif
326 end subroutine gs_shmem_nbsend
327
329 subroutine gs_shmem_nbrecv(this)
330 class(gs_shmem_t), intent(inout) :: this
331
332 end subroutine gs_shmem_nbrecv
333
337 subroutine gs_shmem_nbwait(this, u, n, op, strm)
338 class(gs_shmem_t), intent(inout) :: this
339 integer, intent(in) :: n
340 real(kind=rp), dimension(n), intent(inout) :: u
341 type(c_ptr), intent(inout) :: strm
342 integer :: op
343 integer :: i, j, src, base
344 integer(c_int64_t) :: dummy
345 integer , pointer :: sp(:)
346 real(kind=rp), pointer :: recv_data(:)
347 integer(c_int64_t), pointer :: data_signals(:), ack_signals(:)
348#ifdef HAVE_OPENSHMEM
349
350 call c_f_pointer(this%recv_buf%buf_ptr, recv_data, &
351 [max(this%recv_buf%max_total, 1)])
352 call c_f_pointer(this%data_signals_ptr, data_signals, [pe_size])
353 call c_f_pointer(this%ack_signals_ptr, ack_signals, [pe_size])
354
355 do i = 1, size(this%recv_pe)
356 src = this%recv_pe(i)
357
358 ! Wait for data from src; data_signals[src] is set by src's put.
359 dummy = shmem_signal_wait_until(c_loc(data_signals(src + 1)), &
360 shmem_cmp_ge, this%iter)
361
362 sp => this%recv_dof(src)%array()
363 base = this%recv_buf%offset(i)
364 select case (op)
365 case (gs_op_add)
366 !NEC$ IVDEP
367 do concurrent(j = 1:this%recv_dof(src)%size())
368 u(sp(j)) = u(sp(j)) + recv_data(base + j)
369 end do
370 case (gs_op_mul)
371 !NEC$ IVDEP
372 do concurrent(j = 1:this%recv_dof(src)%size())
373 u(sp(j)) = u(sp(j)) * recv_data(base + j)
374 end do
375 case (gs_op_min)
376 !NEC$ IVDEP
377 do concurrent(j = 1:this%recv_dof(src)%size())
378 u(sp(j)) = min(u(sp(j)), recv_data(base + j))
379 end do
380 case (gs_op_max)
381 !NEC$ IVDEP
382 do concurrent(j = 1:this%recv_dof(src)%size())
383 u(sp(j)) = max(u(sp(j)), recv_data(base + j))
384 end do
385 case default
386 call neko_error("Unknown operation in gs_nbwait_shmem")
387 end select
388
389 ! Tell src we are done with this round's buffer so they may
390 ! overwrite it on the next round. We set our own rank's slot
391 ! in src's ack_signals; src waits there on the next nbsend.
393 c_loc(ack_signals(pe_rank + 1)), &
394 this%iter, src)
395 end do
396#endif
397 end subroutine gs_shmem_nbwait
398
399end module gs_shmem
Definition comm.F90:1
integer, public pe_size
MPI size of communicator.
Definition comm.F90:60
integer, public pe_rank
MPI rank.
Definition comm.F90:57
type(mpi_comm), public neko_comm
MPI communicator.
Definition comm.F90:44
Defines a gather-scatter communication method.
Definition gs_comm.f90:34
Defines Gather-scatter operations.
Definition gs_ops.f90:34
integer, parameter, public gs_op_add
Definition gs_ops.f90:36
integer, parameter, public gs_op_max
Definition gs_ops.f90:36
integer, parameter, public gs_op_min
Definition gs_ops.f90:36
integer, parameter, public gs_op_mul
Definition gs_ops.f90:36
Defines OpenSHMEM gather-scatter communication.
Definition gs_shmem.F90:34
subroutine gs_shmem_buf_init(this, pe_order, dof_stack)
Allocate symmetric memory and per-neighbor bookkeeping for one direction of communication.
Definition gs_shmem.F90:122
subroutine gs_shmem_init(this, send_pe, recv_pe)
Initialise OpenSHMEM based communication method.
Definition gs_shmem.F90:185
subroutine gs_shmem_buf_free(this)
Release symmetric memory and bookkeeping.
Definition gs_shmem.F90:164
subroutine gs_shmem_nbrecv(this)
No-op: receives are completed via remote put-with-signal.
Definition gs_shmem.F90:330
subroutine gs_shmem_free(this)
Deallocate OpenSHMEM based communication method.
Definition gs_shmem.F90:244
subroutine gs_shmem_nbwait(this, u, n, op, strm)
Wait per-neighbor for the signal indicating that data has landed, apply the gather-scatter operation ...
Definition gs_shmem.F90:338
subroutine gs_shmem_nbsend(this, u, n, deps, strm)
Pack the gathered shared dofs into the symmetric send buffer and issue non-blocking puts with signali...
Definition gs_shmem.F90:276
integer, parameter, public i8
Definition num_types.f90:7
integer, parameter, public sp
Definition num_types.f90:8
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Fortran bindings to SHMEM's C API.
Definition shmem.F90:34
@ shmem_signal_set
Definition shmem.F90:79
@ shmem_cmp_ge
Definition shmem.F90:50
Implements a dynamic stack ADT.
Definition stack.f90:49
Utilities.
Definition utils.f90:35
Gather-scatter communication method.
Definition gs_comm.f90:47
Symmetric buffer for one direction of OpenSHMEM communication.
Definition gs_shmem.F90:54
Gather-scatter communication using OpenSHMEM one-sided puts with per-rank signaling for completion (O...
Definition gs_shmem.F90:91
Integer based stack.
Definition stack.f90:77
#define max(a, b)
Definition tensor.cu:40