Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
gather_scatter.f90
Go to the documentation of this file.
1! Copyright (c) 2020-2026, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
38 use gs_device, only : gs_device_t
39 use gs_sx, only : gs_sx_t
40 use gs_cpu, only : gs_cpu_t
44 use gs_mpi, only : gs_mpi_t
45 use gs_shmem, only : gs_shmem_t
46 use gs_caf, only : gs_caf_t
50 use mesh, only : mesh_t
51 use comm, only : pe_rank, pe_size, neko_comm
52 use mpi_f08, only : mpi_reduce, mpi_allreduce, mpi_barrier, mpi_in_place, &
53 mpi_wait, mpi_irecv, mpi_isend, mpi_wtime, mpi_sum, mpi_max, &
54 mpi_integer, mpi_integer2, mpi_integer8, mpi_request, mpi_status, &
55 mpi_status_ignore, mpi_get_count
56 use dofmap, only : dofmap_t
57 use field, only : field_t
58 use num_types, only : rp, dp, i2, i8
60 use stack, only : stack_i4_t
61 use utils, only : neko_error, linear_index
62 use logger, only : neko_log, log_size
66 use, intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr
67 implicit none
68 private
69
70 type, public :: gs_t
71 real(kind=rp), allocatable :: local_gs(:)
72 integer, allocatable :: local_dof_gs(:)
73 integer, allocatable :: local_gs_dof(:)
74 integer, allocatable :: local_blk_len(:)
75 integer, allocatable :: local_blk_off(:)
76 real(kind=rp), allocatable :: shared_gs(:)
77 integer, allocatable :: shared_dof_gs(:)
78 integer, allocatable :: shared_gs_dof(:)
79 integer, allocatable :: shared_blk_len(:)
80 integer, allocatable :: shared_blk_off(:)
81 type(dofmap_t), pointer ::dofmap
82 type(htable_i8_t) :: shared_dofs
83 integer :: nlocal
84 integer :: nshared
85 integer :: nlocal_blks
86 integer :: nshared_blks
87 integer :: local_facet_offset
88 integer :: shared_facet_offset
89 class(gs_bcknd_t), allocatable :: bcknd
90 class(gs_comm_t), allocatable :: comm
91 contains
92 procedure, private, pass(gs) :: gs_op_fld
93 procedure, private, pass(gs) :: gs_op_r4
94 procedure, pass(gs) :: gs_op_vector
95 procedure, pass(gs) :: init => gs_init
96 procedure, pass(gs) :: free => gs_free
97 generic :: op => gs_op_fld, gs_op_r4, gs_op_vector
98 end type gs_t
99
100 ! Expose available gather-scatter operation
102
103 ! Expose available gather-scatter backends
105
106 ! Expose available gather-scatter comm. backends
109
110
111contains
112
117 subroutine gs_init(gs, dofmap, bcknd, comm_bcknd)
118 class(gs_t), intent(inout) :: gs
119 type(dofmap_t), target, intent(inout) :: dofmap
120 character(len=LOG_SIZE) :: log_buf
121 character(len=20) :: bcknd_str
122 integer, optional :: bcknd, comm_bcknd
123 integer :: i, j, ierr, bcknd_, comm_bcknd_
124 integer(i8) :: glb_nshared, glb_nlocal
125 logical :: use_device_mpi, use_device_nccl, use_device_shmem, use_host_mpi
126 logical :: use_host_shmem
127 logical :: use_caf
128 real(kind=rp), allocatable :: tmp(:)
129 type(c_ptr) :: tmp_d = c_null_ptr
130 integer :: strtgy(4) = [int(b'00'), int(b'01'), int(b'10'), int(b'11')]
131 integer :: avg_strtgy, env_len
132 character(len=255) :: env_strtgy, env_gscomm
133 real(kind=dp) :: strtgy_time(4)
134
135 call gs%free()
136
137 call neko_log%section('Gather-Scatter')
138 ! Currently this uses the dofmap which also contains geometric information
139 ! Only connectivity/numbering of points is technically necessary for gs
140 gs%dofmap => dofmap
141
142 use_device_mpi = .false.
143 use_device_nccl = .false.
144 use_device_shmem = .false.
145 use_host_mpi = .false.
146 use_host_shmem = .false.
147 use_caf = .false.
148
149 ! Check if a comm-backend is requested via env. variables
150 call get_environment_variable("NEKO_GS_COMM", env_gscomm, env_len)
151 if (env_len .gt. 0) then
152 if (env_gscomm(1:env_len) .eq. "MPI") then
153 use_host_mpi = .true.
154 else if (env_gscomm(1:env_len) .eq. "MPIGPU") then
155 use_device_mpi = .true.
156 else if (env_gscomm(1:env_len) .eq. "NCCL") then
157 use_device_nccl = .true.
158 else if (env_gscomm(1:env_len) .eq. "SHMEM") then
159 if (neko_bcknd_device .eq. 1) then
160 use_device_shmem = .true.
161 else
162 use_host_shmem = .true.
163 end if
164 else if (env_gscomm(1:env_len) .eq. "CAF") then
165 use_caf = .true.
166 else
167 call neko_error('Unknown Gather-scatter comm. backend')
168 end if
169 end if
170
171
172 if (present(comm_bcknd)) then
173 comm_bcknd_ = comm_bcknd
174 else if (use_host_mpi) then
175 comm_bcknd_ = gs_comm_mpi
176 else if (use_device_mpi) then
177 comm_bcknd_ = gs_comm_mpigpu
178 else if (use_device_nccl) then
179 comm_bcknd_ = gs_comm_nccl
180 else if (use_device_shmem) then
181 comm_bcknd_ = gs_comm_nvshmem
182 else if (use_host_shmem) then
183 comm_bcknd_ = gs_comm_openshmem
184 else if (use_caf) then
185 comm_bcknd_ = gs_comm_caf
186 else
187 if (neko_device_mpi) then
188 comm_bcknd_ = gs_comm_mpigpu
189 use_device_mpi = .true.
190 else
191 comm_bcknd_ = gs_comm_mpi
192 end if
193 end if
194
195 select case (comm_bcknd_)
196 case (gs_comm_mpi)
197 call neko_log%message('Comm : MPI')
198 allocate(gs_mpi_t::gs%comm)
199 case (gs_comm_mpigpu)
200 call neko_log%message('Comm : Device MPI')
201 allocate(gs_device_mpi_t::gs%comm)
202 case (gs_comm_nccl)
203 call neko_log%message('Comm : NCCL')
204 allocate(gs_device_nccl_t::gs%comm)
205 case (gs_comm_nvshmem)
206 call neko_log%message('Comm : NVSHMEM')
207 allocate(gs_device_shmem_t::gs%comm)
208 case (gs_comm_openshmem)
209 call neko_log%message('Comm : OpenSHMEM')
210 allocate(gs_shmem_t::gs%comm)
211 case (gs_comm_caf)
212 call neko_log%message('Comm : CAF')
213 allocate(gs_caf_t::gs%comm)
214 case default
215 call neko_error('Unknown Gather-scatter comm. backend')
216 end select
217 ! Initialize a stack for each rank containing which dofs to send/recv at
218 ! that rank
219 call gs%comm%init_dofs()
220 ! Initialize mapping between local ids and gather-scatter ids
221 ! based on the global numbering in dofmap
222 call gs_init_mapping(gs)
223 ! Setup buffers and which ranks to send/recv data from based on mapping
224 ! and initializes gs%comm (sets up gs%comm%send_dof and gs%comm%recv_dof and
225 ! recv_pe/send_pe)
226 call gs_schedule(gs)
227 ! Global number of points not needing to be sent over mpi for gs operations
228 ! "Internal points"
229 glb_nlocal = int(gs%nlocal, i8)
230 ! Global number of points needing to be communicated with other pes/ranks
231 ! "external points"
232 glb_nshared = int(gs%nshared, i8)
233 ! Can be thought of a measure of the volume of this rank (glb_nlocal) and
234 ! the surface area (glb_nshared) that is shared with other ranks
235 ! Lots of internal volume compared to surface that needs communication is
236 ! good
237
238 if (pe_rank .eq. 0) then
239 call mpi_reduce(mpi_in_place, glb_nlocal, 1, &
240 mpi_integer8, mpi_sum, 0, neko_comm, ierr)
241
242 call mpi_reduce(mpi_in_place, glb_nshared, 1, &
243 mpi_integer8, mpi_sum, 0, neko_comm, ierr)
244 else
245 call mpi_reduce(glb_nlocal, glb_nlocal, 1, &
246 mpi_integer8, mpi_sum, 0, neko_comm, ierr)
247
248 call mpi_reduce(glb_nshared, glb_nshared, 1, &
249 mpi_integer8, mpi_sum, 0, neko_comm, ierr)
250 end if
251
252 write(log_buf, '(A,I12)') 'Avg. internal: ', glb_nlocal/pe_size
253 call neko_log%message(log_buf)
254 write(log_buf, '(A,I12)') 'Avg. external: ', glb_nshared/pe_size
255 call neko_log%message(log_buf)
256
257 if (present(bcknd)) then
258 bcknd_ = bcknd
259 else
260 if (neko_bcknd_sx .eq. 1) then
261 bcknd_ = gs_bcknd_sx
262 else if (neko_bcknd_device .eq. 1) then
263 bcknd_ = gs_bcknd_dev
264 else
265 bcknd_ = gs_bcknd_cpu
266 end if
267 end if
268
269 ! Setup Gather-scatter backend
270 select case (bcknd_)
271 case (gs_bcknd_cpu)
272 allocate(gs_cpu_t::gs%bcknd)
273 bcknd_str = ' std'
274 case (gs_bcknd_dev)
275 allocate(gs_device_t::gs%bcknd)
276 if (neko_bcknd_hip .eq. 1) then
277 bcknd_str = ' hip'
278 else if (neko_bcknd_cuda .eq. 1) then
279 bcknd_str = ' cuda'
280 else if (neko_bcknd_opencl .eq. 1) then
281 bcknd_str = ' opencl'
282 end if
283 case (gs_bcknd_sx)
284 allocate(gs_sx_t::gs%bcknd)
285 bcknd_str = ' sx'
286 case default
287 call neko_error('Unknown Gather-scatter backend')
288 end select
289
290 write(log_buf, '(A)') 'Backend : ' // trim(bcknd_str)
291 call neko_log%message(log_buf)
292
293
294 call gs%bcknd%init(gs%nlocal, gs%nshared, gs%nlocal_blks, gs%nshared_blks)
295
296 if (use_device_mpi .or. use_device_nccl .or. use_device_shmem) then
297 select type (b => gs%bcknd)
298 type is (gs_device_t)
299 b%shared_on_host = .false.
300 end select
301 end if
302
303 if (use_device_mpi) then
304 if (pe_size .gt. 1) then
305 ! Select fastest device MPI strategy at runtime
306 select type (c => gs%comm)
307 type is (gs_device_mpi_t)
308 call get_environment_variable("NEKO_GS_STRTGY", env_strtgy, &
309 env_len)
310 if (env_len .eq. 0) then
311 allocate(tmp(dofmap%size()))
312 call device_map(tmp, tmp_d, dofmap%size())
313 tmp = 1.0_rp
314 call device_memcpy(tmp, tmp_d, dofmap%size(), &
315 host_to_device, sync = .false.)
316 call gs_op_vector(gs, tmp, dofmap%size(), gs_op_add)
317
318 do i = 1, size(strtgy)
319 c%nb_strtgy = strtgy(i)
320 call device_sync
321 call mpi_barrier(neko_comm)
322 strtgy_time(i) = mpi_wtime()
323 do j = 1, 100
324 call gs_op_vector(gs, tmp, dofmap%size(), gs_op_add)
325 end do
326 strtgy_time(i) = (mpi_wtime() - strtgy_time(i)) / 100d0
327 end do
328
329 call device_deassociate(tmp)
330 call device_free(tmp_d)
331 deallocate(tmp)
332
333 c%nb_strtgy = strtgy(minloc(strtgy_time, 1))
334
335 avg_strtgy = minloc(strtgy_time, 1)
336 call mpi_allreduce(mpi_in_place, avg_strtgy, 1, &
337 mpi_integer, mpi_sum, neko_comm)
338 avg_strtgy = avg_strtgy / pe_size
339
340 write(log_buf, '(A,B0.2,A)') 'Avg. strtgy : [', &
341 strtgy(avg_strtgy), ']'
342
343 else
344 read(env_strtgy(1:env_len), *) i
345
346 if (i .lt. 1 .or. i .gt. 4) then
347 call neko_error('Invalid gs sync strtgy')
348 end if
349
350 c%nb_strtgy = strtgy(i)
351 avg_strtgy = i
352
353 write(log_buf, '(A,B0.2,A)') 'Env. strtgy : [', &
354 strtgy(avg_strtgy), ']'
355 end if
356
357 call neko_log%message(log_buf)
358
359 end select
360 end if
361 end if
362
363 call neko_log%end_section()
364
365 end subroutine gs_init
366
368 subroutine gs_free(gs)
369 class(gs_t), intent(inout) :: gs
370
371 nullify(gs%dofmap)
372
373 if (allocated(gs%local_gs)) then
374 deallocate(gs%local_gs)
375 end if
376
377 if (allocated(gs%local_dof_gs)) then
378 deallocate(gs%local_dof_gs)
379 end if
380
381 if (allocated(gs%local_gs_dof)) then
382 deallocate(gs%local_gs_dof)
383 end if
384
385 if (allocated(gs%local_blk_len)) then
386 deallocate(gs%local_blk_len)
387 end if
388
389 if (allocated(gs%local_blk_off)) then
390 deallocate(gs%local_blk_off)
391 end if
392
393 if (allocated(gs%shared_gs)) then
394 deallocate(gs%shared_gs)
395 end if
396
397 if (allocated(gs%shared_dof_gs)) then
398 deallocate(gs%shared_dof_gs)
399 end if
400
401 if (allocated(gs%shared_gs_dof)) then
402 deallocate(gs%shared_gs_dof)
403 end if
404
405 if (allocated(gs%shared_blk_len)) then
406 deallocate(gs%shared_blk_len)
407 end if
408
409 if (allocated(gs%shared_blk_off)) then
410 deallocate(gs%shared_blk_off)
411 end if
412
413 gs%nlocal = 0
414 gs%nshared = 0
415 gs%nlocal_blks = 0
416 gs%nshared_blks = 0
417
418 call gs%shared_dofs%free()
419
420 if (allocated(gs%bcknd)) then
421 call gs%bcknd%free()
422 deallocate(gs%bcknd)
423 end if
424
425 if (allocated(gs%comm)) then
426 call gs%comm%free()
427 deallocate(gs%comm)
428 end if
429
430 end subroutine gs_free
431
433 subroutine gs_init_mapping(gs)
434 type(gs_t), target, intent(inout) :: gs
435 type(mesh_t), pointer :: msh
436 type(dofmap_t), pointer :: dofmap
437 type(stack_i4_t), target :: local_dof, dof_local, shared_dof, dof_shared
438 type(stack_i4_t), target :: local_face_dof, face_dof_local
439 type(stack_i4_t), target :: shared_face_dof, face_dof_shared
440 integer :: i, j, k, l, lx, ly, lz, max_id, max_sid, id, lid, dm_size
441 type(htable_i8_t) :: dm
442 type(htable_i8_t), pointer :: sdm
443
444 dofmap => gs%dofmap
445 msh => dofmap%msh
446 sdm => gs%shared_dofs
447
448 lx = dofmap%Xh%lx
449 ly = dofmap%Xh%ly
450 lz = dofmap%Xh%lz
451 dm_size = dofmap%size()/lx
452
453 call dm%init(dm_size, i)
457 call sdm%init(dofmap%size(), i)
458
459
460 call local_dof%init()
461 call dof_local%init()
462
463 call local_face_dof%init()
464 call face_dof_local%init()
465
466 call shared_dof%init()
467 call dof_shared%init()
468
469 call shared_face_dof%init()
470 call face_dof_shared%init()
471
472 !
473 ! Setup mapping for dofs points
474 !
475
476 max_id = 0
477 max_sid = 0
478 do i = 1, msh%nelv
479 ! Local id of vertices
480 lid = linear_index(1, 1, 1, i, lx, ly, lz)
481 ! Check if this dof is shared among ranks or not
482 if (dofmap%shared_dof(1, 1, 1, i)) then
483 id = gs_mapping_add_dof(sdm, dofmap%dof(1, 1, 1, i), max_sid)
484 !If add unique gather-scatter id to shared_dof stack
485 call shared_dof%push(id)
486 !If add local id to dof_shared stack
487 call dof_shared%push(lid)
488 !Now we have the mapping of local id <-> gather scatter id!
489 else
490 ! Same here, only here we know the point is local
491 ! It will as such not need to be sent to other ranks later
492 id = gs_mapping_add_dof(dm, dofmap%dof(1, 1, 1, i), max_id)
493 call local_dof%push(id)
494 call dof_local%push(lid)
495 end if
496 ! This procedure is then repeated for all vertices and edges
497 ! Facets can be treated a little bit differently since they only have one
498 ! neighbor
499
500 lid = linear_index(lx, 1, 1, i, lx, ly, lz)
501 if (dofmap%shared_dof(lx, 1, 1, i)) then
502 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, 1, 1, i), max_sid)
503 call shared_dof%push(id)
504 call dof_shared%push(lid)
505 else
506 id = gs_mapping_add_dof(dm, dofmap%dof(lx, 1, 1, i), max_id)
507 call local_dof%push(id)
508 call dof_local%push(lid)
509 end if
510
511 lid = linear_index(1, ly, 1, i, lx, ly, lz)
512 if (dofmap%shared_dof(1, ly, 1, i)) then
513 id = gs_mapping_add_dof(sdm, dofmap%dof(1, ly, 1, i), max_sid)
514 call shared_dof%push(id)
515 call dof_shared%push(lid)
516 else
517 id = gs_mapping_add_dof(dm, dofmap%dof(1, ly, 1, i), max_id)
518 call local_dof%push(id)
519 call dof_local%push(lid)
520 end if
521
522 lid = linear_index(lx, ly, 1, i, lx, ly, lz)
523 if (dofmap%shared_dof(lx, ly, 1, i)) then
524 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, ly, 1, i), max_sid)
525 call shared_dof%push(id)
526 call dof_shared%push(lid)
527 else
528 id = gs_mapping_add_dof(dm, dofmap%dof(lx, ly, 1, i), max_id)
529 call local_dof%push(id)
530 call dof_local%push(lid)
531 end if
532 if (lz .gt. 1) then
533 lid = linear_index(1, 1, lz, i, lx, ly, lz)
534 if (dofmap%shared_dof(1, 1, lz, i)) then
535 id = gs_mapping_add_dof(sdm, dofmap%dof(1, 1, lz, i), max_sid)
536 call shared_dof%push(id)
537 call dof_shared%push(lid)
538 else
539 id = gs_mapping_add_dof(dm, dofmap%dof(1, 1, lz, i), max_id)
540 call local_dof%push(id)
541 call dof_local%push(lid)
542 end if
543
544 lid = linear_index(lx, 1, lz, i, lx, ly, lz)
545 if (dofmap%shared_dof(lx, 1, lz, i)) then
546 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, 1, lz, i), max_sid)
547 call shared_dof%push(id)
548 call dof_shared%push(lid)
549 else
550 id = gs_mapping_add_dof(dm, dofmap%dof(lx, 1, lz, i), max_id)
551 call local_dof%push(id)
552 call dof_local%push(lid)
553 end if
554
555 lid = linear_index(1, ly, lz, i, lx, ly, lz)
556 if (dofmap%shared_dof(1, ly, lz, i)) then
557 id = gs_mapping_add_dof(sdm, dofmap%dof(1, ly, lz, i), max_sid)
558 call shared_dof%push(id)
559 call dof_shared%push(lid)
560 else
561 id = gs_mapping_add_dof(dm, dofmap%dof(1, ly, lz, i), max_id)
562 call local_dof%push(id)
563 call dof_local%push(lid)
564 end if
565
566 lid = linear_index(lx, ly, lz, i, lx, ly, lz)
567 if (dofmap%shared_dof(lx, ly, lz, i)) then
568 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, ly, lz, i), max_sid)
569 call shared_dof%push(id)
570 call dof_shared%push(lid)
571 else
572 id = gs_mapping_add_dof(dm, dofmap%dof(lx, ly, lz, i), max_id)
573 call local_dof%push(id)
574 call dof_local%push(lid)
575 end if
576 end if
577 end do
578
579 ! Clear local dofmap table
580 call dm%clear()
581 ! Get gather scatter ids and local ids of edges
582 if (lz .gt. 1) then
583 !
584 ! Setup mapping for dofs on edges
585 !
586 do i = 1, msh%nelv
587
588 !
589 ! dofs on edges in x-direction
590 !
591 if (dofmap%shared_dof(2, 1, 1, i)) then
592 do j = 2, lx - 1
593 id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, 1, i), max_sid)
594 call shared_dof%push(id)
595 id = linear_index(j, 1, 1, i, lx, ly, lz)
596 call dof_shared%push(id)
597 end do
598 else
599 do j = 2, lx - 1
600 id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, 1, i), max_id)
601 call local_dof%push(id)
602 id = linear_index(j, 1, 1, i, lx, ly, lz)
603 call dof_local%push(id)
604 end do
605 end if
606 if (dofmap%shared_dof(2, 1, lz, i)) then
607 do j = 2, lx - 1
608 id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, lz, i), max_sid)
609 call shared_dof%push(id)
610 id = linear_index(j, 1, lz, i, lx, ly, lz)
611 call dof_shared%push(id)
612 end do
613 else
614 do j = 2, lx - 1
615 id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, lz, i), max_id)
616 call local_dof%push(id)
617 id = linear_index(j, 1, lz, i, lx, ly, lz)
618 call dof_local%push(id)
619 end do
620 end if
621
622 if (dofmap%shared_dof(2, ly, 1, i)) then
623 do j = 2, lx - 1
624 id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, 1, i), max_sid)
625 call shared_dof%push(id)
626 id = linear_index(j, ly, 1, i, lx, ly, lz)
627 call dof_shared%push(id)
628 end do
629
630 else
631 do j = 2, lx - 1
632 id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, 1, i), max_id)
633 call local_dof%push(id)
634 id = linear_index(j, ly, 1, i, lx, ly, lz)
635 call dof_local%push(id)
636 end do
637 end if
638 if (dofmap%shared_dof(2, ly, lz, i)) then
639 do j = 2, lx - 1
640 id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, lz, i), max_sid)
641 call shared_dof%push(id)
642 id = linear_index(j, ly, lz, i, lx, ly, lz)
643 call dof_shared%push(id)
644 end do
645 else
646 do j = 2, lx - 1
647 id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, lz, i), max_id)
648 call local_dof%push(id)
649 id = linear_index(j, ly, lz, i, lx, ly, lz)
650 call dof_local%push(id)
651 end do
652 end if
653
654 !
655 ! dofs on edges in y-direction
656 !
657 if (dofmap%shared_dof(1, 2, 1, i)) then
658 do k = 2, ly - 1
659 id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, 1, i), max_sid)
660 call shared_dof%push(id)
661 id = linear_index(1, k, 1, i, lx, ly, lz)
662 call dof_shared%push(id)
663 end do
664 else
665 do k = 2, ly - 1
666 id = gs_mapping_add_dof(dm, dofmap%dof(1, k, 1, i), max_id)
667 call local_dof%push(id)
668 id = linear_index(1, k, 1, i, lx, ly, lz)
669 call dof_local%push(id)
670 end do
671 end if
672 if (dofmap%shared_dof(1, 2, lz, i)) then
673 do k = 2, ly - 1
674 id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, lz, i), max_sid)
675 call shared_dof%push(id)
676 id = linear_index(1, k, lz, i, lx, ly, lz)
677 call dof_shared%push(id)
678 end do
679 else
680 do k = 2, ly - 1
681 id = gs_mapping_add_dof(dm, dofmap%dof(1, k, lz, i), max_id)
682 call local_dof%push(id)
683 id = linear_index(1, k, lz, i, lx, ly, lz)
684 call dof_local%push(id)
685 end do
686 end if
687
688 if (dofmap%shared_dof(lx, 2, 1, i)) then
689 do k = 2, ly - 1
690 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, 1, i), max_sid)
691 call shared_dof%push(id)
692 id = linear_index(lx, k, 1, i, lx, ly, lz)
693 call dof_shared%push(id)
694 end do
695 else
696 do k = 2, ly - 1
697 id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, 1, i), max_id)
698 call local_dof%push(id)
699 id = linear_index(lx, k, 1, i, lx, ly, lz)
700 call dof_local%push(id)
701 end do
702 end if
703 if (dofmap%shared_dof(lx, 2, lz, i)) then
704 do k = 2, ly - 1
705 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, lz, i), max_sid)
706 call shared_dof%push(id)
707 id = linear_index(lx, k, lz, i, lx, ly, lz)
708 call dof_shared%push(id)
709 end do
710 else
711 do k = 2, ly - 1
712 id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, lz, i), max_id)
713 call local_dof%push(id)
714 id = linear_index(lx, k, lz, i, lx, ly, lz)
715 call dof_local%push(id)
716 end do
717 end if
718 !
719 ! dofs on edges in z-direction
720 !
721 if (dofmap%shared_dof(1, 1, 2, i)) then
722 do l = 2, lz - 1
723 id = gs_mapping_add_dof(sdm, dofmap%dof(1, 1, l, i), max_sid)
724 call shared_dof%push(id)
725 id = linear_index(1, 1, l, i, lx, ly, lz)
726 call dof_shared%push(id)
727 end do
728 else
729 do l = 2, lz - 1
730 id = gs_mapping_add_dof(dm, dofmap%dof(1, 1, l, i), max_id)
731 call local_dof%push(id)
732 id = linear_index(1, 1, l, i, lx, ly, lz)
733 call dof_local%push(id)
734 end do
735 end if
736
737 if (dofmap%shared_dof(lx, 1, 2, i)) then
738 do l = 2, lz - 1
739 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, 1, l, i), max_sid)
740 call shared_dof%push(id)
741 id = linear_index(lx, 1, l, i, lx, ly, lz)
742 call dof_shared%push(id)
743 end do
744 else
745 do l = 2, lz - 1
746 id = gs_mapping_add_dof(dm, dofmap%dof(lx, 1, l, i), max_id)
747 call local_dof%push(id)
748 id = linear_index(lx, 1, l, i, lx, ly, lz)
749 call dof_local%push(id)
750 end do
751 end if
752
753 if (dofmap%shared_dof(1, ly, 2, i)) then
754 do l = 2, lz - 1
755 id = gs_mapping_add_dof(sdm, dofmap%dof(1, ly, l, i), max_sid)
756 call shared_dof%push(id)
757 id = linear_index(1, ly, l, i, lx, ly, lz)
758 call dof_shared%push(id)
759 end do
760 else
761 do l = 2, lz - 1
762 id = gs_mapping_add_dof(dm, dofmap%dof(1, ly, l, i), max_id)
763 call local_dof%push(id)
764 id = linear_index(1, ly, l, i, lx, ly, lz)
765 call dof_local%push(id)
766 end do
767 end if
768
769 if (dofmap%shared_dof(lx, ly, 2, i)) then
770 do l = 2, lz - 1
771 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, ly, l, i), max_sid)
772 call shared_dof%push(id)
773 id = linear_index(lx, ly, l, i, lx, ly, lz)
774 call dof_shared%push(id)
775 end do
776 else
777 do l = 2, lz - 1
778 id = gs_mapping_add_dof(dm, dofmap%dof(lx, ly, l, i), max_id)
779 call local_dof%push(id)
780 id = linear_index(lx, ly, l, i, lx, ly, lz)
781 call dof_local%push(id)
782 end do
783 end if
784 end do
785 end if
786
787 ! Clear local dofmap table
788 call dm%clear()
789
790 !
791 ! Setup mapping for dofs on facets
792 !
793 ! This is for 2d
794 if (lz .eq. 1) then
795 do i = 1, msh%nelv
796
797 !
798 ! dofs on edges in x-direction
799 !
800 if (msh%facet_neigh(3, i) .ne. 0) then
801 if (dofmap%shared_dof(2, 1, 1, i)) then
802 do j = 2, lx - 1
803 id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, 1, i), max_sid)
804 call shared_face_dof%push(id)
805 id = linear_index(j, 1, 1, i, lx, ly, lz)
806 call face_dof_shared%push(id)
807 end do
808 else
809 do j = 2, lx - 1
810 id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, 1, i), max_id)
811 call local_face_dof%push(id)
812 id = linear_index(j, 1, 1, i, lx, ly, lz)
813 call face_dof_local%push(id)
814 end do
815 end if
816 end if
817
818 if (msh%facet_neigh(4, i) .ne. 0) then
819 if (dofmap%shared_dof(2, ly, 1, i)) then
820 do j = 2, lx - 1
821 id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, 1, i), &
822 max_sid)
823 call shared_face_dof%push(id)
824 id = linear_index(j, ly, 1, i, lx, ly, lz)
825 call face_dof_shared%push(id)
826 end do
827
828 else
829 do j = 2, lx - 1
830 id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, 1, i), &
831 max_id)
832 call local_face_dof%push(id)
833 id = linear_index(j, ly, 1, i, lx, ly, lz)
834 call face_dof_local%push(id)
835 end do
836 end if
837 end if
838
839 !
840 ! dofs on edges in y-direction
841 !
842 if (msh%facet_neigh(1, i) .ne. 0) then
843 if (dofmap%shared_dof(1, 2, 1, i)) then
844 do k = 2, ly - 1
845 id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, 1, i), max_sid)
846 call shared_face_dof%push(id)
847 id = linear_index(1, k, 1, i, lx, ly, lz)
848 call face_dof_shared%push(id)
849 end do
850 else
851 do k = 2, ly - 1
852 id = gs_mapping_add_dof(dm, dofmap%dof(1, k, 1, i), max_id)
853 call local_face_dof%push(id)
854 id = linear_index(1, k, 1, i, lx, ly, lz)
855 call face_dof_local%push(id)
856 end do
857 end if
858 end if
859
860 if (msh%facet_neigh(2, i) .ne. 0) then
861 if (dofmap%shared_dof(lx, 2, 1, i)) then
862 do k = 2, ly - 1
863 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, 1, i), &
864 max_sid)
865 call shared_face_dof%push(id)
866 id = linear_index(lx, k, 1, i, lx, ly, lz)
867 call face_dof_shared%push(id)
868 end do
869 else
870 do k = 2, ly - 1
871 id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, 1, i), &
872 max_id)
873 call local_face_dof%push(id)
874 id = linear_index(lx, k, 1, i, lx, ly, lz)
875 call face_dof_local%push(id)
876 end do
877 end if
878 end if
879 end do
880 else
881 do i = 1, msh%nelv
882
883 ! Facets in x-direction (s, t)-plane
884 if (msh%facet_neigh(1, i) .ne. 0) then
885 if (dofmap%shared_dof(1, 2, 2, i)) then
886 do l = 2, lz - 1
887 do k = 2, ly - 1
888 id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, l, i), &
889 max_sid)
890 call shared_face_dof%push(id)
891 id = linear_index(1, k, l, i, lx, ly, lz)
892 call face_dof_shared%push(id)
893 end do
894 end do
895 else
896 do l = 2, lz - 1
897 do k = 2, ly - 1
898 id = gs_mapping_add_dof(dm, dofmap%dof(1, k, l, i), &
899 max_id)
900 call local_face_dof%push(id)
901 id = linear_index(1, k, l, i, lx, ly, lz)
902 call face_dof_local%push(id)
903 end do
904 end do
905 end if
906 end if
907
908 if (msh%facet_neigh(2, i) .ne. 0) then
909 if (dofmap%shared_dof(lx, 2, 2, i)) then
910 do l = 2, lz - 1
911 do k = 2, ly - 1
912 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, l, i), &
913 max_sid)
914 call shared_face_dof%push(id)
915 id = linear_index(lx, k, l, i, lx, ly, lz)
916 call face_dof_shared%push(id)
917 end do
918 end do
919 else
920 do l = 2, lz - 1
921 do k = 2, ly - 1
922 id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, l, i), &
923 max_id)
924 call local_face_dof%push(id)
925 id = linear_index(lx, k, l, i, lx, ly, lz)
926 call face_dof_local%push(id)
927 end do
928 end do
929 end if
930 end if
931
932 ! Facets in y-direction (r, t)-plane
933 if (msh%facet_neigh(3, i) .ne. 0) then
934 if (dofmap%shared_dof(2, 1, 2, i)) then
935 do l = 2, lz - 1
936 do j = 2, lx - 1
937 id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, l, i), &
938 max_sid)
939 call shared_face_dof%push(id)
940 id = linear_index(j, 1, l, i, lx, ly, lz)
941 call face_dof_shared%push(id)
942 end do
943 end do
944 else
945 do l = 2, lz - 1
946 do j = 2, lx - 1
947 id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, l, i), &
948 max_id)
949 call local_face_dof%push(id)
950 id = linear_index(j, 1, l, i, lx, ly, lz)
951 call face_dof_local%push(id)
952 end do
953 end do
954 end if
955 end if
956
957 if (msh%facet_neigh(4, i) .ne. 0) then
958 if (dofmap%shared_dof(2, ly, 2, i)) then
959 do l = 2, lz - 1
960 do j = 2, lx - 1
961 id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, l, i), &
962 max_sid)
963 call shared_face_dof%push(id)
964 id = linear_index(j, ly, l, i, lx, ly, lz)
965 call face_dof_shared%push(id)
966 end do
967 end do
968 else
969 do l = 2, lz - 1
970 do j = 2, lx - 1
971 id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, l, i), &
972 max_id)
973 call local_face_dof%push(id)
974 id = linear_index(j, ly, l, i, lx, ly, lz)
975 call face_dof_local%push(id)
976 end do
977 end do
978 end if
979 end if
980
981 ! Facets in z-direction (r, s)-plane
982 if (msh%facet_neigh(5, i) .ne. 0) then
983 if (dofmap%shared_dof(2, 2, 1, i)) then
984 do k = 2, ly - 1
985 do j = 2, lx - 1
986 id = gs_mapping_add_dof(sdm, dofmap%dof(j, k, 1, i), &
987 max_sid)
988 call shared_face_dof%push(id)
989 id = linear_index(j, k, 1, i, lx, ly, lz)
990 call face_dof_shared%push(id)
991 end do
992 end do
993 else
994 do k = 2, ly - 1
995 do j = 2, lx - 1
996 id = gs_mapping_add_dof(dm, dofmap%dof(j, k, 1, i), &
997 max_id)
998 call local_face_dof%push(id)
999 id = linear_index(j, k, 1, i, lx, ly, lz)
1000 call face_dof_local%push(id)
1001 end do
1002 end do
1003 end if
1004 end if
1005
1006 if (msh%facet_neigh(6, i) .ne. 0) then
1007 if (dofmap%shared_dof(2, 2, lz, i)) then
1008 do k = 2, ly - 1
1009 do j = 2, lx - 1
1010 id = gs_mapping_add_dof(sdm, dofmap%dof(j, k, lz, i), &
1011 max_sid)
1012 call shared_face_dof%push(id)
1013 id = linear_index(j, k, lz, i, lx, ly, lz)
1014 call face_dof_shared%push(id)
1015 end do
1016 end do
1017 else
1018 do k = 2, ly - 1
1019 do j = 2, lx - 1
1020 id = gs_mapping_add_dof(dm, dofmap%dof(j, k, lz, i), &
1021 max_id)
1022 call local_face_dof%push(id)
1023 id = linear_index(j, k, lz, i, lx, ly, lz)
1024 call face_dof_local%push(id)
1025 end do
1026 end do
1027 end if
1028 end if
1029 end do
1030 end if
1031
1032
1033 call dm%free()
1034
1035 gs%nlocal = local_dof%size() + local_face_dof%size()
1036 gs%local_facet_offset = local_dof%size() + 1
1037
1038 ! Finalize local dof to gather-scatter index
1039 allocate(gs%local_dof_gs(gs%nlocal))
1040
1041 ! Add dofs on points and edges
1042
1043 ! We should use the %array() procedure, which works great for
1044 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1045 ! certain data types
1046 select type (dof_array => local_dof%data)
1047 type is (integer)
1048 j = local_dof%size()
1049 do i = 1, j
1050 gs%local_dof_gs(i) = dof_array(i)
1051 end do
1052 end select
1053 call local_dof%free()
1054
1055 ! Add dofs on faces
1056
1057 ! We should use the %array() procedure, which works great for
1058 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1059 ! certain data types
1060 select type (dof_array => local_face_dof%data)
1061 type is (integer)
1062 do i = 1, local_face_dof%size()
1063 gs%local_dof_gs(i + j) = dof_array(i)
1064 end do
1065 end select
1066 call local_face_dof%free()
1067
1068 ! Finalize local gather-scatter index to dof
1069 allocate(gs%local_gs_dof(gs%nlocal))
1070
1071 ! Add gather-scatter index on points and edges
1072
1073 ! We should use the %array() procedure, which works great for
1074 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1075 ! certain data types
1076 select type (dof_array => dof_local%data)
1077 type is (integer)
1078 j = dof_local%size()
1079 do i = 1, j
1080 gs%local_gs_dof(i) = dof_array(i)
1081 end do
1082 end select
1083 call dof_local%free()
1084
1085 ! We should use the %array() procedure, which works great for
1086 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1087 ! certain data types
1088 select type (dof_array => face_dof_local%data)
1089 type is (integer)
1090 do i = 1, face_dof_local%size()
1091 gs%local_gs_dof(i+j) = dof_array(i)
1092 end do
1093 end select
1094 call face_dof_local%free()
1095
1096 call gs_qsort_dofmap(gs%local_dof_gs, gs%local_gs_dof, &
1097 gs%nlocal, 1, gs%nlocal)
1098
1099 call gs_find_blks(gs%local_dof_gs, gs%local_blk_len, &
1100 gs%local_blk_off, gs%nlocal_blks, gs%nlocal, gs%local_facet_offset)
1101
1102 ! Allocate buffer for local gs-ops
1103 allocate(gs%local_gs(gs%nlocal))
1104
1105 gs%nshared = shared_dof%size() + shared_face_dof%size()
1106 gs%shared_facet_offset = shared_dof%size() + 1
1107
1108 ! Finalize shared dof to gather-scatter index
1109 allocate(gs%shared_dof_gs(gs%nshared))
1110
1111 ! Add shared dofs on points and edges
1112
1113 ! We should use the %array() procedure, which works great for
1114 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1115 ! certain data types
1116 select type (dof_array => shared_dof%data)
1117 type is (integer)
1118 j = shared_dof%size()
1119 do i = 1, j
1120 gs%shared_dof_gs(i) = dof_array(i)
1121 end do
1122 end select
1123 call shared_dof%free()
1124
1125 ! Add shared dofs on faces
1126
1127 ! We should use the %array() procedure, which works great for
1128 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1129 ! certain data types
1130 select type (dof_array => shared_face_dof%data)
1131 type is (integer)
1132 do i = 1, shared_face_dof%size()
1133 gs%shared_dof_gs(i + j) = dof_array(i)
1134 end do
1135 end select
1136 call shared_face_dof%free()
1137
1138 ! Finalize shared gather-scatter index to dof
1139 allocate(gs%shared_gs_dof(gs%nshared))
1140
1141 ! Add dofs on points and edges
1142
1143 ! We should use the %array() procedure, which works great for
1144 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1145 ! certain data types
1146 select type (dof_array => dof_shared%data)
1147 type is (integer)
1148 j = dof_shared%size()
1149 do i = 1, j
1150 gs%shared_gs_dof(i) = dof_array(i)
1151 end do
1152 end select
1153 call dof_shared%free()
1154
1155 ! We should use the %array() procedure, which works great for
1156 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1157 ! certain data types
1158 select type (dof_array => face_dof_shared%data)
1159 type is (integer)
1160 do i = 1, face_dof_shared%size()
1161 gs%shared_gs_dof(i + j) = dof_array(i)
1162 end do
1163 end select
1164 call face_dof_shared%free()
1165
1166 ! Allocate buffer for shared gs-ops
1167 allocate(gs%shared_gs(gs%nshared))
1168
1169 if (gs%nshared .gt. 0) then
1170 call gs_qsort_dofmap(gs%shared_dof_gs, gs%shared_gs_dof, &
1171 gs%nshared, 1, gs%nshared)
1172
1173 call gs_find_blks(gs%shared_dof_gs, gs%shared_blk_len, &
1174 gs%shared_blk_off, gs%nshared_blks, gs%nshared, &
1175 gs%shared_facet_offset)
1176 end if
1177
1178 contains
1179
1188 function gs_mapping_add_dof(map_, dof, max_id) result(id)
1189 type(htable_i8_t), intent(inout) :: map_
1190 integer(kind=i8), intent(inout) :: dof
1191 integer, intent(inout) :: max_id
1192 integer :: id
1193
1194 if (map_%get(dof, id) .gt. 0) then
1195 max_id = max_id + 1
1196 call map_%set(dof, max_id)
1197 id = max_id
1198 end if
1199
1200 end function gs_mapping_add_dof
1201
1203 recursive subroutine gs_qsort_dofmap(dg, gd, n, lo, hi)
1204 integer, intent(inout) :: n
1205 integer, dimension(n), intent(inout) :: dg
1206 integer, dimension(n), intent(inout) :: gd
1207 integer :: lo, hi
1208 integer :: tmp, i, j, pivot
1209
1210 i = lo - 1
1211 j = hi + 1
1212 pivot = dg((lo + hi) / 2)
1213 do
1214 do
1215 i = i + 1
1216 if (dg(i) .ge. pivot) exit
1217 end do
1218
1219 do
1220 j = j - 1
1221 if (dg(j) .le. pivot) exit
1222 end do
1223
1224 if (i .lt. j) then
1225 tmp = dg(i)
1226 dg(i) = dg(j)
1227 dg(j) = tmp
1228
1229 tmp = gd(i)
1230 gd(i) = gd(j)
1231 gd(j) = tmp
1232 else if (i .eq. j) then
1233 i = i + 1
1234 exit
1235 else
1236 exit
1237 end if
1238 end do
1239 if (lo .lt. j) call gs_qsort_dofmap(dg, gd, n, lo, j)
1240 if (i .lt. hi) call gs_qsort_dofmap(dg, gd, n, i, hi)
1241
1242 end subroutine gs_qsort_dofmap
1243
1245 subroutine gs_find_blks(dg, blk_len, blk_off, nblks, n, m)
1246 integer, intent(in) :: n
1247 integer, intent(in) :: m
1248 integer, dimension(n), intent(inout) :: dg
1249 integer, allocatable, intent(inout) :: blk_len(:)
1250 integer, allocatable, intent(inout) :: blk_off(:)
1251 integer, intent(inout) :: nblks
1252 integer :: i, j
1253 integer :: id, count
1254 type(stack_i4_t), target :: blks
1255
1256 call blks%init()
1257 i = 1
1258 do while (i .lt. m)
1259 id = dg(i)
1260 count = 1
1261 j = i
1262 do while ( j+1 .le. n .and. dg(j+1) .eq. id)
1263 j = j + 1
1264 count = count + 1
1265 end do
1266 call blks%push(count)
1267 i = j + 1
1268 end do
1269
1270 select type (blk_array => blks%data)
1271 type is (integer)
1272 nblks = blks%size()
1273 allocate(blk_len(nblks))
1274 do i = 1, nblks
1275 blk_len(i) = blk_array(i)
1276 end do
1277 allocate(blk_off(nblks))
1278 blk_off(1) = 0
1279 do i = 2, nblks
1280 blk_off(i) = blk_off(i - 1) + blk_len(i - 1)
1281 end do
1282 end select
1283 call blks%free()
1284
1285 end subroutine gs_find_blks
1286
1287 end subroutine gs_init_mapping
1288
1290 subroutine gs_schedule(gs)
1291 type(gs_t), target, intent(inout) :: gs
1292 integer(kind=i8), allocatable :: send_buf(:), recv_buf(:)
1293 integer(kind=i2), allocatable :: shared_flg(:), recv_flg(:)
1294 type(htable_iter_i8_t) :: it
1295 type(stack_i4_t) :: send_pe, recv_pe
1296 type(mpi_status) :: status
1297 type(mpi_request) :: send_req, recv_req
1298 integer :: i, j, max_recv, src, dst, ierr, n_recv
1299 integer :: tmp, shared_gs_id
1300 integer :: nshared_unique
1301
1302 nshared_unique = gs%shared_dofs%num_entries()
1303
1304 call it%init(gs%shared_dofs)
1305 allocate(send_buf(nshared_unique))
1306 i = 1
1307 do while (it%next())
1308 send_buf(i) = it%key()
1309 i = i + 1
1310 end do
1311
1312 call send_pe%init()
1313 call recv_pe%init()
1314
1315
1316 !
1317 ! Schedule exchange of shared dofs
1318 !
1319
1320 call mpi_allreduce(nshared_unique, max_recv, 1, &
1321 mpi_integer, mpi_max, neko_comm, ierr)
1322
1323 allocate(recv_buf(max_recv))
1324 allocate(shared_flg(max_recv))
1325 allocate(recv_flg(max_recv))
1326
1328 do i = 1, size(gs%dofmap%msh%neigh_order)
1329 src = modulo(pe_rank - gs%dofmap%msh%neigh_order(i) + pe_size, pe_size)
1330 dst = modulo(pe_rank + gs%dofmap%msh%neigh_order(i), pe_size)
1331
1332 if (gs%dofmap%msh%neigh(src)) then
1333 call mpi_irecv(recv_buf, max_recv, mpi_integer8, &
1334 src, 0, neko_comm, recv_req, ierr)
1335 end if
1336
1337 if (gs%dofmap%msh%neigh(dst)) then
1338 call mpi_isend(send_buf, nshared_unique, mpi_integer8, &
1339 dst, 0, neko_comm, send_req, ierr)
1340 end if
1341
1342 if (gs%dofmap%msh%neigh(src)) then
1343 call mpi_wait(recv_req, status, ierr)
1344 call mpi_get_count(status, mpi_integer8, n_recv, ierr)
1345
1346 do j = 1, n_recv
1347 shared_flg(j) = gs%shared_dofs%get(recv_buf(j), shared_gs_id)
1348 if (shared_flg(j) .eq. 0) then
1350 call gs%comm%recv_dof(src)%push(shared_gs_id)
1351 end if
1352 end do
1353
1354 if (gs%comm%recv_dof(src)%size() .gt. 0) then
1355 call recv_pe%push(src)
1356 end if
1357 end if
1358
1359 if (gs%dofmap%msh%neigh(dst)) then
1360 call mpi_wait(send_req, mpi_status_ignore, ierr)
1361 call mpi_irecv(recv_flg, max_recv, mpi_integer2, &
1362 dst, 0, neko_comm, recv_req, ierr)
1363 end if
1364
1365 if (gs%dofmap%msh%neigh(src)) then
1366 call mpi_isend(shared_flg, n_recv, mpi_integer2, &
1367 src, 0, neko_comm, send_req, ierr)
1368 end if
1369
1370 if (gs%dofmap%msh%neigh(dst)) then
1371 call mpi_wait(recv_req, status, ierr)
1372 call mpi_get_count(status, mpi_integer2, n_recv, ierr)
1373
1374 do j = 1, n_recv
1375 if (recv_flg(j) .eq. 0) then
1376 tmp = gs%shared_dofs%get(send_buf(j), shared_gs_id)
1378 call gs%comm%send_dof(dst)%push(shared_gs_id)
1379 end if
1380 end do
1381
1382 if (gs%comm%send_dof(dst)%size() .gt. 0) then
1383 call send_pe%push(dst)
1384 end if
1385 end if
1386
1387 if (gs%dofmap%msh%neigh(src)) then
1388 call mpi_wait(send_req, mpi_status_ignore, ierr)
1389 end if
1390
1391 end do
1392
1393 call gs%comm%init(send_pe, recv_pe)
1394
1395 call send_pe%free()
1396 call recv_pe%free()
1397
1398 deallocate(send_buf)
1399 deallocate(recv_flg)
1400 deallocate(shared_flg)
1401 !This arrays seems to take massive amounts of memory...
1402 call gs%shared_dofs%free()
1403
1404 end subroutine gs_schedule
1405
1407 subroutine gs_op_fld(gs, u, op, event)
1408 class(gs_t), intent(inout) :: gs
1409 type(field_t), intent(inout) :: u
1410 type(c_ptr), optional, intent(inout) :: event
1411 integer :: n, op
1412
1413 n = u%msh%nelv * u%Xh%lx * u%Xh%ly * u%Xh%lz
1414 if (present(event)) then
1415 call gs_op_vector(gs, u%x, n, op, event)
1416 else
1417 call gs_op_vector(gs, u%x, n, op)
1418 end if
1419
1420 end subroutine gs_op_fld
1421
1423 subroutine gs_op_r4(gs, u, n, op, event)
1424 class(gs_t), intent(inout) :: gs
1425 integer, intent(in) :: n
1426 real(kind=rp), contiguous, dimension(:,:,:,:), intent(inout) :: u
1427 type(c_ptr), optional, intent(inout) :: event
1428 integer :: op
1429
1430 if (present(event)) then
1431 call gs_op_vector(gs, u, n, op, event)
1432 else
1433 call gs_op_vector(gs, u, n, op)
1434 end if
1435
1436 end subroutine gs_op_r4
1437
1439 subroutine gs_op_vector(gs, u, n, op, event)
1440 class(gs_t), intent(inout) :: gs
1441 integer, intent(in) :: n
1442 real(kind=rp), dimension(n), intent(inout) :: u
1443 type(c_ptr), optional, intent(inout) :: event
1444 integer :: m, l, op, lo, so
1445
1446 lo = gs%local_facet_offset
1447 so = -gs%shared_facet_offset
1448 m = gs%nlocal
1449 l = gs%nshared
1450
1451 call profiler_start_region("gather_scatter", 5)
1452 ! Gather shared dofs
1453 if (pe_size .gt. 1 .and. n .gt. 0) then
1454 call profiler_start_region("gs_nbrecv", 13)
1455 call gs%comm%nbrecv()
1456 call profiler_end_region("gs_nbrecv", 13)
1457 call profiler_start_region("gs_gather_shared", 14)
1458 call gs%bcknd%gather(gs%shared_gs, l, so, gs%shared_dof_gs, u, n, &
1459 gs%shared_gs_dof, gs%nshared_blks, gs%shared_blk_len, &
1460 gs%shared_blk_off, op, .true.)
1461 call profiler_end_region("gs_gather_shared", 14)
1462 call profiler_start_region("gs_nbsend", 6)
1463 call gs%comm%nbsend(gs%shared_gs, l, &
1464 gs%bcknd%gather_event, gs%bcknd%gs_stream)
1465 call profiler_end_region("gs_nbsend", 6)
1466
1467 end if
1468
1469 ! Gather-scatter local dofs
1470 call profiler_start_region("gs_local", 12)
1471 call gs%bcknd%gather(gs%local_gs, m, lo, gs%local_dof_gs, u, n, &
1472 gs%local_gs_dof, gs%nlocal_blks, gs%local_blk_len, gs%local_blk_off, &
1473 op, .false.)
1474 call gs%bcknd%scatter(gs%local_gs, m, gs%local_dof_gs, u, n, &
1475 gs%local_gs_dof, gs%nlocal_blks, gs%local_blk_len, gs%local_blk_off, &
1476 .false., c_null_ptr)
1477 call profiler_end_region("gs_local", 12)
1478 ! Scatter shared dofs
1479 if (pe_size .gt. 1 .and. n .gt. 0) then
1480 call profiler_start_region("gs_nbwait", 7)
1481 call gs%comm%nbwait(gs%shared_gs, l, op, gs%bcknd%gs_stream)
1482 call profiler_end_region("gs_nbwait", 7)
1483 call profiler_start_region("gs_scatter_shared", 15)
1484 if (present(event)) then
1485 call gs%bcknd%scatter(gs%shared_gs, l,&
1486 gs%shared_dof_gs, u, n, &
1487 gs%shared_gs_dof, gs%nshared_blks, &
1488 gs%shared_blk_len, gs%shared_blk_off, .true., event)
1489 else
1490 call gs%bcknd%scatter(gs%shared_gs, l,&
1491 gs%shared_dof_gs, u, n, &
1492 gs%shared_gs_dof, gs%nshared_blks, &
1493 gs%shared_blk_len, gs%shared_blk_off, .true., c_null_ptr)
1494 end if
1495 call profiler_end_region("gs_scatter_shared", 15)
1496 end if
1497
1498 call profiler_end_region("gather_scatter", 5)
1499
1500 end subroutine gs_op_vector
1501
1502end module gather_scatter
recursive subroutine gs_qsort_dofmap(dg, gd, n, lo, hi)
Sort the dof lists based on the dof to gather-scatter list.
subroutine gs_find_blks(dg, blk_len, blk_off, nblks, n, m)
Find blocks sharing dofs in non-facet data.
integer function gs_mapping_add_dof(map_, dof, max_id)
Register a unique dof Takes the unique id dof and checks if it is in the htable map_ If it is we retu...
Deassociate a Fortran array from a device pointer.
Definition device.F90:101
Map a Fortran array to a device (allocate and associate)
Definition device.F90:77
Copy data between host and device (or device and device)
Definition device.F90:71
Synchronize a device or stream.
Definition device.F90:113
Definition comm.F90:1
integer, public pe_size
MPI size of communicator.
Definition comm.F90:60
integer, public pe_rank
MPI rank.
Definition comm.F90:57
type(mpi_comm), public neko_comm
MPI communicator.
Definition comm.F90:44
Device abstraction, common interface for various accelerators.
Definition device.F90:34
integer, parameter, public host_to_device
Definition device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:225
Defines a mapping of the degrees of freedom.
Definition dofmap.f90:35
Defines a field.
Definition field.f90:34
Gather-scatter.
subroutine gs_init_mapping(gs)
Setup mapping of dofs to gather-scatter operations.
subroutine gs_schedule(gs)
Schedule shared gather-scatter operations.
subroutine gs_free(gs)
Deallocate a gather-scatter kernel.
subroutine gs_op_r4(gs, u, n, op, event)
Gather-scatter operation on a rank 4 array.
subroutine gs_init(gs, dofmap, bcknd, comm_bcknd)
Initialize a gather-scatter kernel.
subroutine gs_op_vector(gs, u, n, op, event)
Gather-scatter operation on a vector u with op op.
subroutine gs_op_fld(gs, u, op, event)
Gather-scatter operation on a field u with op op.
Defines a gather-scatter backend.
Definition gs_bcknd.f90:34
integer, parameter, public gs_bcknd_cpu
Definition gs_bcknd.f90:40
integer, parameter, public gs_bcknd_sx
Definition gs_bcknd.f90:40
integer, parameter, public gs_bcknd_dev
Definition gs_bcknd.f90:40
Defines Coarray Fortran gather-scatter communication.
Definition gs_caf.F90:34
Defines a gather-scatter communication method.
Definition gs_comm.f90:34
integer, parameter, public gs_comm_mpigpu
Definition gs_comm.f90:42
integer, parameter, public gs_comm_mpi
Definition gs_comm.f90:42
integer, parameter, public gs_comm_nvshmem
Definition gs_comm.f90:42
integer, parameter, public gs_comm_openshmem
Definition gs_comm.f90:42
integer, parameter, public gs_comm_nccl
Definition gs_comm.f90:42
integer, parameter, public gs_comm_caf
Definition gs_comm.f90:42
Generic Gather-scatter backend for CPUs.
Definition gs_cpu.f90:34
Defines GPU aware MPI gather-scatter communication.
Defines NCCL based gather-scatter communication.
Defines GPU aware MPI gather-scatter communication.
Generic Gather-scatter backend for accelerators.
Definition gs_device.F90:34
Defines MPI gather-scatter communication.
Definition gs_mpi.f90:34
Defines Gather-scatter operations.
Definition gs_ops.f90:34
integer, parameter, public gs_op_add
Definition gs_ops.f90:36
integer, parameter, public gs_op_max
Definition gs_ops.f90:36
integer, parameter, public gs_op_min
Definition gs_ops.f90:36
integer, parameter, public gs_op_mul
Definition gs_ops.f90:36
Defines OpenSHMEM gather-scatter communication.
Definition gs_shmem.F90:34
Generic Gather-scatter backend for NEC Vector Engines.
Definition gs_sx.f90:34
Implements a hash table ADT.
Definition htable.f90:52
Logging routines.
Definition log.f90:34
type(log_t), public neko_log
Global log stream.
Definition log.f90:80
integer, parameter, public log_size
Definition log.f90:46
Defines a mesh.
Definition mesh.f90:34
Build configurations.
integer, parameter neko_bcknd_sx
integer, parameter neko_bcknd_hip
integer, parameter neko_bcknd_device
integer, parameter neko_bcknd_opencl
logical, parameter neko_device_mpi
integer, parameter neko_bcknd_cuda
integer, parameter, public i2
Definition num_types.f90:5
integer, parameter, public i8
Definition num_types.f90:7
integer, parameter, public dp
Definition num_types.f90:9
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Profiling interface.
Definition profiler.F90:34
subroutine, public profiler_start_region(name, region_id)
Started a named (name) profiler region.
Definition profiler.F90:79
subroutine, public profiler_end_region(name, region_id)
End the most recently started profiler region.
Definition profiler.F90:116
Implements a dynamic stack ADT.
Definition stack.f90:49
Utilities.
Definition utils.f90:35
pure integer function, public linear_index(i, j, k, l, lx, ly, lz)
Compute the address of a (i,j,k,l) array with sizes (1:lx, 1:ly, 1:lz, :)
Definition utils.f90:283
Gather-scatter backend.
Definition gs_bcknd.f90:44
Gather-scatter communication using Coarray Fortran (F2008). Each image puts directly into the (module...
Definition gs_caf.F90:129
Gather-scatter communication method.
Definition gs_comm.f90:47
Gather-scatter backend for CPUs.
Definition gs_cpu.f90:43
Gather-scatter backend for offloading devices.
Definition gs_device.F90:48
Gather-scatter communication using device MPI. The arrays are indexed per PE like send_pe and @ recv_...
Gather-scatter communication using NCCL The arrays are indexed per PE like send_pe and @ recv_pe.
Gather-scatter communication using device SHMEM. The arrays are indexed per PE like send_pe and @ rec...
Gather-scatter communication using MPI.
Definition gs_mpi.f90:62
Gather-scatter communication using OpenSHMEM one-sided puts with per-rank signaling for completion (O...
Definition gs_shmem.F90:91
Gather-scatter backend for NEC SX-Aurora.
Definition gs_sx.f90:43
Integer*8 based hash table.
Definition htable.f90:112
Iterator for an integer*8 based hash table.
Definition htable.f90:195
Integer based stack.
Definition stack.f90:77