Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
gs_shmem.F90
Go to the documentation of this file.
1! Copyright (c) 2026, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
35 use num_types, only : rp, c_rp, i8
36 use gs_comm, only : gs_comm_t
38 use stack, only : stack_i4_t
39 use comm, only : neko_comm, pe_rank, pe_size
40 use mpi_f08, only : mpi_allreduce, mpi_alltoall, mpi_integer, mpi_max
41 use utils, only : neko_error
42#ifdef HAVE_OPENSHMEM
47#endif
48 use, intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr, c_loc, &
49 c_f_pointer, c_associated, c_sizeof, c_size_t, c_int64_t
50 implicit none
51 private
52
56 integer, allocatable :: ndofs(:)
58 integer, allocatable :: offset(:)
61 integer, allocatable :: remote_offset(:)
63 integer :: total = 0
65 integer :: max_total = 0
67 type(c_ptr) :: buf_ptr = c_null_ptr
68 contains
69 procedure, pass(this) :: init => gs_shmem_buf_init
70 procedure, pass(this) :: free => gs_shmem_buf_free
71 end type gs_shmem_buf_t
72
91 type, public, extends(gs_comm_t) :: gs_shmem_t
92 type(gs_shmem_buf_t) :: send_buf
93 type(gs_shmem_buf_t) :: recv_buf
94 ! Symmetric data-arrival signals; data_signals[r] is set to `iter`
95 ! by PE r via shmem_putmem_signal_nbi when it has put data into
96 ! our recv buffer. Sized to pe_size on every PE.
97 type(c_ptr) :: data_signals_ptr = c_null_ptr
98 ! Symmetric ack signals; ack_signals[r] is set to `iter` by PE r
99 ! via shmem_uint64_atomic_set after it has consumed our most
100 ! recent put. Sized to pe_size on every PE.
101 type(c_ptr) :: ack_signals_ptr = c_null_ptr
102 ! Monotonically increasing iteration counter; sender writes this
103 ! value into the receiver's data signal slot via SHMEM_SIGNAL_SET,
104 ! and the receiver writes the same value into the sender's ack
105 ! slot after consumption.
106 integer(kind=i8) :: iter = 0
107 contains
108 procedure, pass(this) :: init => gs_shmem_init
109 procedure, pass(this) :: free => gs_shmem_free
110 procedure, pass(this) :: nbsend => gs_shmem_nbsend
111 procedure, pass(this) :: nbrecv => gs_shmem_nbrecv
112 procedure, pass(this) :: nbwait => gs_shmem_nbwait
113 end type gs_shmem_t
114
115contains
116
121 subroutine gs_shmem_buf_init(this, pe_order, dof_stack)
122 class(gs_shmem_buf_t), intent(inout) :: this
123 integer, intent(in) :: pe_order(:)
124 type(stack_i4_t), intent(inout) :: dof_stack(0:)
125 integer :: i, ierr, n
126 integer(c_size_t) :: sz
127 real(c_rp) :: rp_dummy
128#ifndef HAVE_OPENSHMEM
129 call neko_error('Neko was not built with OpenSHMEM support')
130#else
131
132 n = size(pe_order)
133
134 allocate(this%ndofs(n))
135 allocate(this%offset(n))
136 allocate(this%remote_offset(n))
137
138 do i = 1, n
139 this%remote_offset(i) = -1
140 end do
141
142 this%total = 0
143 do i = 1, n
144 this%ndofs(i) = dof_stack(pe_order(i))%size()
145 this%offset(i) = this%total
146 this%total = this%total + this%ndofs(i)
147 end do
148
149 ! OpenSHMEM symmetric memory must be allocated with the same size on
150 ! every PE.
151 call mpi_allreduce(this%total, this%max_total, 1, mpi_integer, mpi_max, &
152 neko_comm, ierr)
153
154 sz = c_sizeof(rp_dummy) * int(max(this%max_total, 1), c_size_t)
155 this%buf_ptr = shmem_malloc(sz)
156 if (.not. c_associated(this%buf_ptr)) then
157 call neko_error('shmem_malloc failed for gs_shmem buffer')
158 end if
159#endif
160 end subroutine gs_shmem_buf_init
161
163 subroutine gs_shmem_buf_free(this)
164 class(gs_shmem_buf_t), intent(inout) :: this
165
166 if (allocated(this%ndofs)) deallocate(this%ndofs)
167 if (allocated(this%offset)) deallocate(this%offset)
168 if (allocated(this%remote_offset)) deallocate(this%remote_offset)
169
170#ifdef HAVE_OPENSHMEM
171 if (c_associated(this%buf_ptr)) then
172 ! shmem_free is collective; gs_shmem_free issues a barrier before
173 ! tearing down buffers.
174 call shmem_free(this%buf_ptr)
175 end if
176#endif
177 this%buf_ptr = c_null_ptr
178 this%total = 0
179 this%max_total = 0
180
181 end subroutine gs_shmem_buf_free
182
184 subroutine gs_shmem_init(this, send_pe, recv_pe)
185 class(gs_shmem_t), intent(inout) :: this
186 type(stack_i4_t), intent(inout) :: send_pe
187 type(stack_i4_t), intent(inout) :: recv_pe
188 integer :: i, ierr
189 integer(c_size_t) :: i64_size
190 integer(c_int64_t) :: i64_dummy
191 integer, allocatable :: local_offsets(:), remote_offsets(:)
192#ifndef HAVE_OPENSHMEM
193 call neko_error('Neko was not built with OpenSHMEM support')
194#else
195
196 call this%free()
197 call this%init_order(send_pe, recv_pe)
198
199 call this%send_buf%init(this%send_pe, this%send_dof)
200 call this%recv_buf%init(this%recv_pe, this%recv_dof)
201
202 ! Allocate the per-rank symmetric signal arrays. Size pe_size on
203 ! every PE -> no Allreduce needed and no slot handshake required;
204 ! every PE writes/reads at offset = remote PE's rank.
205 i64_size = c_sizeof(i64_dummy)
206 this%data_signals_ptr = shmem_calloc(int(pe_size, c_size_t), i64_size)
207 if (.not. c_associated(this%data_signals_ptr)) then
208 call neko_error('shmem_calloc failed for gs_shmem data signals')
209 end if
210 this%ack_signals_ptr = shmem_calloc(int(pe_size, c_size_t), i64_size)
211 if (.not. c_associated(this%ack_signals_ptr)) then
212 call neko_error('shmem_calloc failed for gs_shmem ack signals')
213 end if
214
215 ! Offset exchange via Alltoall. send_pe and recv_pe are built in
216 ! gs_schedule with a shifted-modulo neighbor pattern; pairwise
217 ! Isend/Irecv keyed by neighbor rank turned out to deadlock at
218 ! certain PE counts. Alltoall is a single collective call that
219 ! cannot mismatch, and the payload is just pe_size ints per PE.
220 allocate(local_offsets(0:pe_size - 1))
221 allocate(remote_offsets(0:pe_size - 1))
222 local_offsets = -1
223 do i = 1, size(this%recv_pe)
224 local_offsets(this%recv_pe(i)) = this%recv_buf%offset(i)
225 end do
226 call mpi_alltoall(local_offsets, 1, mpi_integer, &
227 remote_offsets, 1, mpi_integer, neko_comm, ierr)
228 do i = 1, size(this%send_pe)
229 this%send_buf%remote_offset(i) = remote_offsets(this%send_pe(i))
230 end do
231 deallocate(local_offsets)
232 deallocate(remote_offsets)
233
234 this%iter = 0
235
236 ! Ensure all PEs have completed symmetric allocation before any
237 ! one-sided communication is issued.
238 call shmem_barrier_all()
239#endif
240 end subroutine gs_shmem_init
241
243 subroutine gs_shmem_free(this)
244 class(gs_shmem_t), intent(inout) :: this
245
246#ifdef HAVE_OPENSHMEM
247 ! shmem_free is collective; synchronize first to ensure no in-flight
248 ! puts target the buffers we are about to release.
249 call shmem_barrier_all()
250
251 if (c_associated(this%data_signals_ptr)) then
252 call shmem_free(this%data_signals_ptr)
253 end if
254 if (c_associated(this%ack_signals_ptr)) then
255 call shmem_free(this%ack_signals_ptr)
256 end if
257 this%data_signals_ptr = c_null_ptr
258 this%ack_signals_ptr = c_null_ptr
259 this%iter = 0
260#endif
261
262 call this%send_buf%free()
263 call this%recv_buf%free()
264
265 call this%free_order()
266 call this%free_dofs()
267
268 end subroutine gs_shmem_free
269
275 subroutine gs_shmem_nbsend(this, u, n, tag, deps, strm)
276 class(gs_shmem_t), intent(inout) :: this
277 integer, intent(in) :: n
278 real(kind=rp), dimension(n), intent(inout) :: u
279 integer, intent(in) :: tag
280 type(c_ptr), intent(inout) :: deps
281 type(c_ptr), intent(inout) :: strm
282 integer :: i, j, dst, base
283 integer(c_size_t) :: nbytes
284 integer , pointer :: sp(:)
285 real(kind=rp), pointer :: send_data(:), recv_data(:)
286 integer(c_int64_t), pointer :: data_signals(:), ack_signals(:)
287 real(c_rp) :: rp_dummy
288#ifdef HAVE_OPENSHMEM
289
290 ! Each gs op gets a fresh signal value so receivers can distinguish
291 ! puts from one call vs. the next.
292 this%iter = this%iter + 1
293
294 call c_f_pointer(this%send_buf%buf_ptr, send_data, &
295 [max(this%send_buf%max_total, 1)])
296 call c_f_pointer(this%recv_buf%buf_ptr, recv_data, &
297 [max(this%recv_buf%max_total, 1)])
298 call c_f_pointer(this%data_signals_ptr, data_signals, [pe_size])
299 call c_f_pointer(this%ack_signals_ptr, ack_signals, [pe_size])
300
301 do i = 1, size(this%send_pe)
302 dst = this%send_pe(i)
303
304 ! Wait for dst to have consumed our previous round's data before
305 ! we overwrite their recv buffer. ack_signals[dst] is calloc'd
306 ! to 0 and dst writes the iter value (via atomic_set) after
307 ! consumption, so for iter == 1 the wait succeeds immediately.
308 call shmem_uint64_wait_until(c_loc(ack_signals(dst + 1)), &
309 shmem_cmp_ge, this%iter - 1_8)
310
311 sp => this%send_dof(dst)%array()
312 base = this%send_buf%offset(i)
313 do concurrent(j = 1:this%send_dof(dst)%size())
314 send_data(base + j) = u(sp(j))
315 end do
316
317 nbytes = int(this%send_buf%ndofs(i), c_size_t) * c_sizeof(rp_dummy)
318 ! Put data + signal dst's data_signals[my_rank].
320 c_loc(recv_data(this%send_buf%remote_offset(i) + 1)), &
321 c_loc(send_data(base + 1)), &
322 nbytes, &
323 c_loc(data_signals(pe_rank + 1)), &
324 this%iter, shmem_signal_set, dst)
325 end do
326#endif
327 end subroutine gs_shmem_nbsend
328
330 subroutine gs_shmem_nbrecv(this, tag)
331 class(gs_shmem_t), intent(inout) :: this
332 integer, intent(in) :: tag
333
334 end subroutine gs_shmem_nbrecv
335
339 subroutine gs_shmem_nbwait(this, u, n, op, strm)
340 class(gs_shmem_t), intent(inout) :: this
341 integer, intent(in) :: n
342 real(kind=rp), dimension(n), intent(inout) :: u
343 type(c_ptr), intent(inout) :: strm
344 integer :: op
345 integer :: i, j, src, base
346 integer(c_int64_t) :: dummy
347 integer , pointer :: sp(:)
348 real(kind=rp), pointer :: recv_data(:)
349 integer(c_int64_t), pointer :: data_signals(:), ack_signals(:)
350#ifdef HAVE_OPENSHMEM
351
352 call c_f_pointer(this%recv_buf%buf_ptr, recv_data, &
353 [max(this%recv_buf%max_total, 1)])
354 call c_f_pointer(this%data_signals_ptr, data_signals, [pe_size])
355 call c_f_pointer(this%ack_signals_ptr, ack_signals, [pe_size])
356
357 do i = 1, size(this%recv_pe)
358 src = this%recv_pe(i)
359
360 ! Wait for data from src; data_signals[src] is set by src's put.
361 dummy = shmem_signal_wait_until(c_loc(data_signals(src + 1)), &
362 shmem_cmp_ge, this%iter)
363
364 sp => this%recv_dof(src)%array()
365 base = this%recv_buf%offset(i)
366 select case (op)
367 case (gs_op_add)
368 !NEC$ IVDEP
369 do concurrent(j = 1:this%recv_dof(src)%size())
370 u(sp(j)) = u(sp(j)) + recv_data(base + j)
371 end do
372 case (gs_op_mul)
373 !NEC$ IVDEP
374 do concurrent(j = 1:this%recv_dof(src)%size())
375 u(sp(j)) = u(sp(j)) * recv_data(base + j)
376 end do
377 case (gs_op_min)
378 !NEC$ IVDEP
379 do concurrent(j = 1:this%recv_dof(src)%size())
380 u(sp(j)) = min(u(sp(j)), recv_data(base + j))
381 end do
382 case (gs_op_max)
383 !NEC$ IVDEP
384 do concurrent(j = 1:this%recv_dof(src)%size())
385 u(sp(j)) = max(u(sp(j)), recv_data(base + j))
386 end do
387 case default
388 call neko_error("Unknown operation in gs_nbwait_shmem")
389 end select
390
391 ! Tell src we are done with this round's buffer so they may
392 ! overwrite it on the next round. We set our own rank's slot
393 ! in src's ack_signals; src waits there on the next nbsend.
395 c_loc(ack_signals(pe_rank + 1)), &
396 this%iter, src)
397 end do
398#endif
399 end subroutine gs_shmem_nbwait
400
401end module gs_shmem
Definition comm.F90:1
integer, public pe_size
MPI size of communicator.
Definition comm.F90:61
integer, public pe_rank
MPI rank.
Definition comm.F90:58
type(mpi_comm), public neko_comm
MPI communicator.
Definition comm.F90:45
Defines a gather-scatter communication method.
Definition gs_comm.f90:34
Defines Gather-scatter operations.
Definition gs_ops.f90:34
integer, parameter, public gs_op_add
Definition gs_ops.f90:36
integer, parameter, public gs_op_max
Definition gs_ops.f90:36
integer, parameter, public gs_op_min
Definition gs_ops.f90:36
integer, parameter, public gs_op_mul
Definition gs_ops.f90:36
Defines OpenSHMEM gather-scatter communication.
Definition gs_shmem.F90:34
subroutine gs_shmem_buf_init(this, pe_order, dof_stack)
Allocate symmetric memory and per-neighbor bookkeeping for one direction of communication.
Definition gs_shmem.F90:122
subroutine gs_shmem_init(this, send_pe, recv_pe)
Initialise OpenSHMEM based communication method.
Definition gs_shmem.F90:185
subroutine gs_shmem_nbsend(this, u, n, tag, deps, strm)
Pack the gathered shared dofs into the symmetric send buffer and issue non-blocking puts with signali...
Definition gs_shmem.F90:276
subroutine gs_shmem_buf_free(this)
Release symmetric memory and bookkeeping.
Definition gs_shmem.F90:164
subroutine gs_shmem_free(this)
Deallocate OpenSHMEM based communication method.
Definition gs_shmem.F90:244
subroutine gs_shmem_nbrecv(this, tag)
No-op: receives are completed via remote put-with-signal.
Definition gs_shmem.F90:331
subroutine gs_shmem_nbwait(this, u, n, op, strm)
Wait per-neighbor for the signal indicating that data has landed, apply the gather-scatter operation ...
Definition gs_shmem.F90:340
integer, parameter, public i8
Definition num_types.f90:7
integer, parameter, public sp
Definition num_types.f90:8
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Fortran bindings to SHMEM's C API.
Definition shmem.F90:34
@ shmem_signal_set
Definition shmem.F90:79
@ shmem_cmp_ge
Definition shmem.F90:50
Implements a dynamic stack ADT.
Definition stack.f90:49
Utilities.
Definition utils.f90:35
Gather-scatter communication method.
Definition gs_comm.f90:47
Symmetric buffer for one direction of OpenSHMEM communication.
Definition gs_shmem.F90:54
Gather-scatter communication using OpenSHMEM one-sided puts with per-rank signaling for completion (O...
Definition gs_shmem.F90:91
Integer based stack.
Definition stack.f90:77
#define max(a, b)
Definition tensor.cu:40