Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
gather_scatter.f90
Go to the documentation of this file.
1! Copyright (c) 2020-2026, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
38 use gs_device, only : gs_device_t
39 use gs_sx, only : gs_sx_t
40 use gs_cpu, only : gs_cpu_t
44 use gs_mpi, only : gs_mpi_t
45 use gs_shmem, only : gs_shmem_t
46 use gs_caf, only : gs_caf_t
50 use mesh, only : mesh_t
51 use comm, only : pe_rank, pe_size, neko_comm
52 use mpi_f08, only : mpi_reduce, mpi_allreduce, mpi_barrier, mpi_in_place, &
53 mpi_wait, mpi_irecv, mpi_isend, mpi_wtime, mpi_sum, mpi_max, &
54 mpi_integer, mpi_integer2, mpi_integer8, mpi_request, mpi_status, &
55 mpi_status_ignore, mpi_get_count
56 use dofmap, only : dofmap_t
57 use field, only : field_t
58 use num_types, only : rp, dp, i2, i8
60 use stack, only : stack_i4_t
61 use utils, only : neko_error, linear_index
62 use logger, only : neko_log, log_size
66 use, intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr
67 !$ use omp_lib, only : omp_get_thread_num
68 implicit none
69 private
70
71 type, public :: gs_t
72 real(kind=rp), allocatable :: local_gs(:)
73 integer, allocatable :: local_dof_gs(:)
74 integer, allocatable :: local_gs_dof(:)
75 integer, allocatable :: local_blk_len(:)
76 integer, allocatable :: local_blk_off(:)
77 real(kind=rp), allocatable :: shared_gs(:)
78 integer, allocatable :: shared_dof_gs(:)
79 integer, allocatable :: shared_gs_dof(:)
80 integer, allocatable :: shared_blk_len(:)
81 integer, allocatable :: shared_blk_off(:)
82 type(dofmap_t), pointer ::dofmap
83 type(htable_i8_t) :: shared_dofs
84 integer :: nlocal
85 integer :: nshared
86 integer :: nlocal_blks
87 integer :: nshared_blks
88 integer :: local_facet_offset
89 integer :: shared_facet_offset
90 class(gs_bcknd_t), allocatable :: bcknd
91 class(gs_comm_t), allocatable :: comm
92 contains
93 procedure, private, pass(gs) :: gs_op_fld
94 procedure, private, pass(gs) :: gs_op_r4
95 procedure, pass(gs) :: gs_op_vector
96 procedure, pass(gs) :: init => gs_init
97 procedure, pass(gs) :: free => gs_free
98 generic :: op => gs_op_fld, gs_op_r4, gs_op_vector
99 end type gs_t
100
101 ! Expose available gather-scatter operation
103
104 ! Expose available gather-scatter backends
106
107 ! Expose available gather-scatter comm. backends
110
111
112contains
113
118 subroutine gs_init(gs, dofmap, bcknd, comm_bcknd)
119 class(gs_t), intent(inout) :: gs
120 type(dofmap_t), target, intent(inout) :: dofmap
121 character(len=LOG_SIZE) :: log_buf
122 character(len=20) :: bcknd_str
123 integer, optional :: bcknd, comm_bcknd
124 integer :: i, j, ierr, bcknd_, comm_bcknd_
125 integer(i8) :: glb_nshared, glb_nlocal
126 logical :: use_device_mpi, use_device_nccl, use_device_shmem, use_host_mpi
127 logical :: use_host_shmem
128 logical :: use_caf
129 real(kind=rp), allocatable :: tmp(:)
130 type(c_ptr) :: tmp_d = c_null_ptr
131 integer :: strtgy(4) = [int(b'00'), int(b'01'), int(b'10'), int(b'11')]
132 integer :: avg_strtgy, env_len
133 character(len=255) :: env_strtgy, env_gscomm
134 real(kind=dp) :: strtgy_time(4)
135
136 call gs%free()
137
138 call neko_log%section('Gather-Scatter')
139 ! Currently this uses the dofmap which also contains geometric information
140 ! Only connectivity/numbering of points is technically necessary for gs
141 gs%dofmap => dofmap
142
143 use_device_mpi = .false.
144 use_device_nccl = .false.
145 use_device_shmem = .false.
146 use_host_mpi = .false.
147 use_host_shmem = .false.
148 use_caf = .false.
149
150 ! Check if a comm-backend is requested via env. variables
151 call get_environment_variable("NEKO_GS_COMM", env_gscomm, env_len)
152 if (env_len .gt. 0) then
153 if (env_gscomm(1:env_len) .eq. "MPI") then
154 use_host_mpi = .true.
155 else if (env_gscomm(1:env_len) .eq. "MPIGPU") then
156 use_device_mpi = .true.
157 else if (env_gscomm(1:env_len) .eq. "NCCL") then
158 use_device_nccl = .true.
159 else if (env_gscomm(1:env_len) .eq. "SHMEM") then
160 if (neko_bcknd_device .eq. 1) then
161 use_device_shmem = .true.
162 else
163 use_host_shmem = .true.
164 end if
165 else if (env_gscomm(1:env_len) .eq. "CAF") then
166 use_caf = .true.
167 else
168 call neko_error('Unknown Gather-scatter comm. backend')
169 end if
170 end if
171
172
173 if (present(comm_bcknd)) then
174 comm_bcknd_ = comm_bcknd
175 else if (use_host_mpi) then
176 comm_bcknd_ = gs_comm_mpi
177 else if (use_device_mpi) then
178 comm_bcknd_ = gs_comm_mpigpu
179 else if (use_device_nccl) then
180 comm_bcknd_ = gs_comm_nccl
181 else if (use_device_shmem) then
182 comm_bcknd_ = gs_comm_nvshmem
183 else if (use_host_shmem) then
184 comm_bcknd_ = gs_comm_openshmem
185 else if (use_caf) then
186 comm_bcknd_ = gs_comm_caf
187 else
188 if (neko_device_mpi) then
189 comm_bcknd_ = gs_comm_mpigpu
190 use_device_mpi = .true.
191 else
192 comm_bcknd_ = gs_comm_mpi
193 end if
194 end if
195
196 select case (comm_bcknd_)
197 case (gs_comm_mpi)
198 call neko_log%message('Comm : MPI')
199 allocate(gs_mpi_t::gs%comm)
200 case (gs_comm_mpigpu)
201 call neko_log%message('Comm : Device MPI')
202 allocate(gs_device_mpi_t::gs%comm)
203 case (gs_comm_nccl)
204 call neko_log%message('Comm : NCCL')
205 allocate(gs_device_nccl_t::gs%comm)
206 case (gs_comm_nvshmem)
207 call neko_log%message('Comm : NVSHMEM')
208 allocate(gs_device_shmem_t::gs%comm)
209 case (gs_comm_openshmem)
210 call neko_log%message('Comm : OpenSHMEM')
211 allocate(gs_shmem_t::gs%comm)
212 case (gs_comm_caf)
213 call neko_log%message('Comm : CAF')
214 allocate(gs_caf_t::gs%comm)
215 case default
216 call neko_error('Unknown Gather-scatter comm. backend')
217 end select
218 ! Initialize a stack for each rank containing which dofs to send/recv at
219 ! that rank
220 call gs%comm%init_dofs()
221 ! Initialize mapping between local ids and gather-scatter ids
222 ! based on the global numbering in dofmap
223 call gs_init_mapping(gs)
224 ! Setup buffers and which ranks to send/recv data from based on mapping
225 ! and initializes gs%comm (sets up gs%comm%send_dof and gs%comm%recv_dof and
226 ! recv_pe/send_pe)
227 call gs_schedule(gs)
228 ! Global number of points not needing to be sent over mpi for gs operations
229 ! "Internal points"
230 glb_nlocal = int(gs%nlocal, i8)
231 ! Global number of points needing to be communicated with other pes/ranks
232 ! "external points"
233 glb_nshared = int(gs%nshared, i8)
234 ! Can be thought of a measure of the volume of this rank (glb_nlocal) and
235 ! the surface area (glb_nshared) that is shared with other ranks
236 ! Lots of internal volume compared to surface that needs communication is
237 ! good
238
239 if (pe_rank .eq. 0) then
240 call mpi_reduce(mpi_in_place, glb_nlocal, 1, &
241 mpi_integer8, mpi_sum, 0, neko_comm, ierr)
242
243 call mpi_reduce(mpi_in_place, glb_nshared, 1, &
244 mpi_integer8, mpi_sum, 0, neko_comm, ierr)
245 else
246 call mpi_reduce(glb_nlocal, glb_nlocal, 1, &
247 mpi_integer8, mpi_sum, 0, neko_comm, ierr)
248
249 call mpi_reduce(glb_nshared, glb_nshared, 1, &
250 mpi_integer8, mpi_sum, 0, neko_comm, ierr)
251 end if
252
253 write(log_buf, '(A,I12)') 'Avg. internal: ', glb_nlocal/pe_size
254 call neko_log%message(log_buf)
255 write(log_buf, '(A,I12)') 'Avg. external: ', glb_nshared/pe_size
256 call neko_log%message(log_buf)
257
258 if (present(bcknd)) then
259 bcknd_ = bcknd
260 else
261 if (neko_bcknd_sx .eq. 1) then
262 bcknd_ = gs_bcknd_sx
263 else if (neko_bcknd_device .eq. 1) then
264 bcknd_ = gs_bcknd_dev
265 else
266 bcknd_ = gs_bcknd_cpu
267 end if
268 end if
269
270 ! Setup Gather-scatter backend
271 select case (bcknd_)
272 case (gs_bcknd_cpu)
273 allocate(gs_cpu_t::gs%bcknd)
274 bcknd_str = ' std'
275 case (gs_bcknd_dev)
276 allocate(gs_device_t::gs%bcknd)
277 if (neko_bcknd_hip .eq. 1) then
278 bcknd_str = ' hip'
279 else if (neko_bcknd_cuda .eq. 1) then
280 bcknd_str = ' cuda'
281 else if (neko_bcknd_opencl .eq. 1) then
282 bcknd_str = ' opencl'
283 else if (neko_bcknd_metal .eq. 1) then
284 bcknd_str = ' metal'
285 end if
286 case (gs_bcknd_sx)
287 allocate(gs_sx_t::gs%bcknd)
288 bcknd_str = ' sx'
289 case default
290 call neko_error('Unknown Gather-scatter backend')
291 end select
292
293 write(log_buf, '(A)') 'Backend : ' // trim(bcknd_str)
294 call neko_log%message(log_buf)
295
296
297 call gs%bcknd%init(gs%nlocal, gs%nshared, gs%nlocal_blks, gs%nshared_blks)
298
299 if (use_device_mpi .or. use_device_nccl .or. use_device_shmem) then
300 select type (b => gs%bcknd)
301 type is (gs_device_t)
302 b%shared_on_host = .false.
303 end select
304 end if
305
306 if (use_device_mpi) then
307 if (pe_size .gt. 1) then
308 ! Select fastest device MPI strategy at runtime
309 select type (c => gs%comm)
310 type is (gs_device_mpi_t)
311 call get_environment_variable("NEKO_GS_STRTGY", env_strtgy, &
312 env_len)
313 if (env_len .eq. 0) then
314 allocate(tmp(dofmap%size()))
315 call device_map(tmp, tmp_d, dofmap%size())
316 tmp = 1.0_rp
317 call device_memcpy(tmp, tmp_d, dofmap%size(), &
318 host_to_device, sync = .false.)
319 call gs_op_vector(gs, tmp, dofmap%size(), gs_op_add)
320
321 do i = 1, size(strtgy)
322 c%nb_strtgy = strtgy(i)
323 call device_sync
324 call mpi_barrier(neko_comm)
325 strtgy_time(i) = mpi_wtime()
326 do j = 1, 100
327 call gs_op_vector(gs, tmp, dofmap%size(), gs_op_add)
328 end do
329 strtgy_time(i) = (mpi_wtime() - strtgy_time(i)) / 100d0
330 end do
331
332 call device_unmap(tmp, tmp_d)
333 deallocate(tmp)
334
335 c%nb_strtgy = strtgy(minloc(strtgy_time, 1))
336
337 avg_strtgy = minloc(strtgy_time, 1)
338 call mpi_allreduce(mpi_in_place, avg_strtgy, 1, &
339 mpi_integer, mpi_sum, neko_comm)
340 avg_strtgy = avg_strtgy / pe_size
341
342 write(log_buf, '(A,B0.2,A)') 'Avg. strtgy : [', &
343 strtgy(avg_strtgy), ']'
344
345 else
346 read(env_strtgy(1:env_len), *) i
347
348 if (i .lt. 1 .or. i .gt. 4) then
349 call neko_error('Invalid gs sync strtgy')
350 end if
351
352 c%nb_strtgy = strtgy(i)
353 avg_strtgy = i
354
355 write(log_buf, '(A,B0.2,A)') 'Env. strtgy : [', &
356 strtgy(avg_strtgy), ']'
357 end if
358
359 call neko_log%message(log_buf)
360
361 end select
362 end if
363 end if
364
365 call neko_log%end_section()
366
367 end subroutine gs_init
368
370 subroutine gs_free(gs)
371 class(gs_t), intent(inout) :: gs
372
373 nullify(gs%dofmap)
374
375 if (allocated(gs%local_gs)) then
376 deallocate(gs%local_gs)
377 end if
378
379 if (allocated(gs%local_dof_gs)) then
380 deallocate(gs%local_dof_gs)
381 end if
382
383 if (allocated(gs%local_gs_dof)) then
384 deallocate(gs%local_gs_dof)
385 end if
386
387 if (allocated(gs%local_blk_len)) then
388 deallocate(gs%local_blk_len)
389 end if
390
391 if (allocated(gs%local_blk_off)) then
392 deallocate(gs%local_blk_off)
393 end if
394
395 if (allocated(gs%shared_gs)) then
396 deallocate(gs%shared_gs)
397 end if
398
399 if (allocated(gs%shared_dof_gs)) then
400 deallocate(gs%shared_dof_gs)
401 end if
402
403 if (allocated(gs%shared_gs_dof)) then
404 deallocate(gs%shared_gs_dof)
405 end if
406
407 if (allocated(gs%shared_blk_len)) then
408 deallocate(gs%shared_blk_len)
409 end if
410
411 if (allocated(gs%shared_blk_off)) then
412 deallocate(gs%shared_blk_off)
413 end if
414
415 gs%nlocal = 0
416 gs%nshared = 0
417 gs%nlocal_blks = 0
418 gs%nshared_blks = 0
419
420 call gs%shared_dofs%free()
421
422 if (allocated(gs%bcknd)) then
423 call gs%bcknd%free()
424 deallocate(gs%bcknd)
425 end if
426
427 if (allocated(gs%comm)) then
428 call gs%comm%free()
429 deallocate(gs%comm)
430 end if
431
432 end subroutine gs_free
433
435 subroutine gs_init_mapping(gs)
436 type(gs_t), target, intent(inout) :: gs
437 type(mesh_t), pointer :: msh
438 type(dofmap_t), pointer :: dofmap
439 type(stack_i4_t), target :: local_dof, dof_local, shared_dof, dof_shared
440 type(stack_i4_t), target :: local_face_dof, face_dof_local
441 type(stack_i4_t), target :: shared_face_dof, face_dof_shared
442 integer :: i, j, k, l, lx, ly, lz, max_id, max_sid, id, lid, dm_size
443 type(htable_i8_t) :: dm
444 type(htable_i8_t), pointer :: sdm
445
446 dofmap => gs%dofmap
447 msh => dofmap%msh
448 sdm => gs%shared_dofs
449
450 lx = dofmap%Xh%lx
451 ly = dofmap%Xh%ly
452 lz = dofmap%Xh%lz
453 dm_size = dofmap%size()/lx
454
455 call dm%init(dm_size, i)
459 call sdm%init(dofmap%size(), i)
460
461
462 call local_dof%init()
463 call dof_local%init()
464
465 call local_face_dof%init()
466 call face_dof_local%init()
467
468 call shared_dof%init()
469 call dof_shared%init()
470
471 call shared_face_dof%init()
472 call face_dof_shared%init()
473
474 !
475 ! Setup mapping for dofs points
476 !
477
478 max_id = 0
479 max_sid = 0
480 do i = 1, msh%nelv
481 ! Local id of vertices
482 lid = linear_index(1, 1, 1, i, lx, ly, lz)
483 ! Check if this dof is shared among ranks or not
484 if (dofmap%shared_dof(1, 1, 1, i)) then
485 id = gs_mapping_add_dof(sdm, dofmap%dof(1, 1, 1, i), max_sid)
486 !If add unique gather-scatter id to shared_dof stack
487 call shared_dof%push(id)
488 !If add local id to dof_shared stack
489 call dof_shared%push(lid)
490 !Now we have the mapping of local id <-> gather scatter id!
491 else
492 ! Same here, only here we know the point is local
493 ! It will as such not need to be sent to other ranks later
494 id = gs_mapping_add_dof(dm, dofmap%dof(1, 1, 1, i), max_id)
495 call local_dof%push(id)
496 call dof_local%push(lid)
497 end if
498 ! This procedure is then repeated for all vertices and edges
499 ! Facets can be treated a little bit differently since they only have one
500 ! neighbor
501
502 lid = linear_index(lx, 1, 1, i, lx, ly, lz)
503 if (dofmap%shared_dof(lx, 1, 1, i)) then
504 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, 1, 1, i), max_sid)
505 call shared_dof%push(id)
506 call dof_shared%push(lid)
507 else
508 id = gs_mapping_add_dof(dm, dofmap%dof(lx, 1, 1, i), max_id)
509 call local_dof%push(id)
510 call dof_local%push(lid)
511 end if
512
513 lid = linear_index(1, ly, 1, i, lx, ly, lz)
514 if (dofmap%shared_dof(1, ly, 1, i)) then
515 id = gs_mapping_add_dof(sdm, dofmap%dof(1, ly, 1, i), max_sid)
516 call shared_dof%push(id)
517 call dof_shared%push(lid)
518 else
519 id = gs_mapping_add_dof(dm, dofmap%dof(1, ly, 1, i), max_id)
520 call local_dof%push(id)
521 call dof_local%push(lid)
522 end if
523
524 lid = linear_index(lx, ly, 1, i, lx, ly, lz)
525 if (dofmap%shared_dof(lx, ly, 1, i)) then
526 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, ly, 1, i), max_sid)
527 call shared_dof%push(id)
528 call dof_shared%push(lid)
529 else
530 id = gs_mapping_add_dof(dm, dofmap%dof(lx, ly, 1, i), max_id)
531 call local_dof%push(id)
532 call dof_local%push(lid)
533 end if
534 if (lz .gt. 1) then
535 lid = linear_index(1, 1, lz, i, lx, ly, lz)
536 if (dofmap%shared_dof(1, 1, lz, i)) then
537 id = gs_mapping_add_dof(sdm, dofmap%dof(1, 1, lz, i), max_sid)
538 call shared_dof%push(id)
539 call dof_shared%push(lid)
540 else
541 id = gs_mapping_add_dof(dm, dofmap%dof(1, 1, lz, i), max_id)
542 call local_dof%push(id)
543 call dof_local%push(lid)
544 end if
545
546 lid = linear_index(lx, 1, lz, i, lx, ly, lz)
547 if (dofmap%shared_dof(lx, 1, lz, i)) then
548 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, 1, lz, i), max_sid)
549 call shared_dof%push(id)
550 call dof_shared%push(lid)
551 else
552 id = gs_mapping_add_dof(dm, dofmap%dof(lx, 1, lz, i), max_id)
553 call local_dof%push(id)
554 call dof_local%push(lid)
555 end if
556
557 lid = linear_index(1, ly, lz, i, lx, ly, lz)
558 if (dofmap%shared_dof(1, ly, lz, i)) then
559 id = gs_mapping_add_dof(sdm, dofmap%dof(1, ly, lz, i), max_sid)
560 call shared_dof%push(id)
561 call dof_shared%push(lid)
562 else
563 id = gs_mapping_add_dof(dm, dofmap%dof(1, ly, lz, i), max_id)
564 call local_dof%push(id)
565 call dof_local%push(lid)
566 end if
567
568 lid = linear_index(lx, ly, lz, i, lx, ly, lz)
569 if (dofmap%shared_dof(lx, ly, lz, i)) then
570 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, ly, lz, i), max_sid)
571 call shared_dof%push(id)
572 call dof_shared%push(lid)
573 else
574 id = gs_mapping_add_dof(dm, dofmap%dof(lx, ly, lz, i), max_id)
575 call local_dof%push(id)
576 call dof_local%push(lid)
577 end if
578 end if
579 end do
580
581 ! Clear local dofmap table
582 call dm%clear()
583 ! Get gather scatter ids and local ids of edges
584 if (lz .gt. 1) then
585 !
586 ! Setup mapping for dofs on edges
587 !
588 do i = 1, msh%nelv
589
590 !
591 ! dofs on edges in x-direction
592 !
593 if (dofmap%shared_dof(2, 1, 1, i)) then
594 do j = 2, lx - 1
595 id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, 1, i), max_sid)
596 call shared_dof%push(id)
597 id = linear_index(j, 1, 1, i, lx, ly, lz)
598 call dof_shared%push(id)
599 end do
600 else
601 do j = 2, lx - 1
602 id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, 1, i), max_id)
603 call local_dof%push(id)
604 id = linear_index(j, 1, 1, i, lx, ly, lz)
605 call dof_local%push(id)
606 end do
607 end if
608 if (dofmap%shared_dof(2, 1, lz, i)) then
609 do j = 2, lx - 1
610 id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, lz, i), max_sid)
611 call shared_dof%push(id)
612 id = linear_index(j, 1, lz, i, lx, ly, lz)
613 call dof_shared%push(id)
614 end do
615 else
616 do j = 2, lx - 1
617 id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, lz, i), max_id)
618 call local_dof%push(id)
619 id = linear_index(j, 1, lz, i, lx, ly, lz)
620 call dof_local%push(id)
621 end do
622 end if
623
624 if (dofmap%shared_dof(2, ly, 1, i)) then
625 do j = 2, lx - 1
626 id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, 1, i), max_sid)
627 call shared_dof%push(id)
628 id = linear_index(j, ly, 1, i, lx, ly, lz)
629 call dof_shared%push(id)
630 end do
631
632 else
633 do j = 2, lx - 1
634 id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, 1, i), max_id)
635 call local_dof%push(id)
636 id = linear_index(j, ly, 1, i, lx, ly, lz)
637 call dof_local%push(id)
638 end do
639 end if
640 if (dofmap%shared_dof(2, ly, lz, i)) then
641 do j = 2, lx - 1
642 id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, lz, i), max_sid)
643 call shared_dof%push(id)
644 id = linear_index(j, ly, lz, i, lx, ly, lz)
645 call dof_shared%push(id)
646 end do
647 else
648 do j = 2, lx - 1
649 id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, lz, i), max_id)
650 call local_dof%push(id)
651 id = linear_index(j, ly, lz, i, lx, ly, lz)
652 call dof_local%push(id)
653 end do
654 end if
655
656 !
657 ! dofs on edges in y-direction
658 !
659 if (dofmap%shared_dof(1, 2, 1, i)) then
660 do k = 2, ly - 1
661 id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, 1, i), max_sid)
662 call shared_dof%push(id)
663 id = linear_index(1, k, 1, i, lx, ly, lz)
664 call dof_shared%push(id)
665 end do
666 else
667 do k = 2, ly - 1
668 id = gs_mapping_add_dof(dm, dofmap%dof(1, k, 1, i), max_id)
669 call local_dof%push(id)
670 id = linear_index(1, k, 1, i, lx, ly, lz)
671 call dof_local%push(id)
672 end do
673 end if
674 if (dofmap%shared_dof(1, 2, lz, i)) then
675 do k = 2, ly - 1
676 id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, lz, i), max_sid)
677 call shared_dof%push(id)
678 id = linear_index(1, k, lz, i, lx, ly, lz)
679 call dof_shared%push(id)
680 end do
681 else
682 do k = 2, ly - 1
683 id = gs_mapping_add_dof(dm, dofmap%dof(1, k, lz, i), max_id)
684 call local_dof%push(id)
685 id = linear_index(1, k, lz, i, lx, ly, lz)
686 call dof_local%push(id)
687 end do
688 end if
689
690 if (dofmap%shared_dof(lx, 2, 1, i)) then
691 do k = 2, ly - 1
692 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, 1, i), max_sid)
693 call shared_dof%push(id)
694 id = linear_index(lx, k, 1, i, lx, ly, lz)
695 call dof_shared%push(id)
696 end do
697 else
698 do k = 2, ly - 1
699 id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, 1, i), max_id)
700 call local_dof%push(id)
701 id = linear_index(lx, k, 1, i, lx, ly, lz)
702 call dof_local%push(id)
703 end do
704 end if
705 if (dofmap%shared_dof(lx, 2, lz, i)) then
706 do k = 2, ly - 1
707 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, lz, i), max_sid)
708 call shared_dof%push(id)
709 id = linear_index(lx, k, lz, i, lx, ly, lz)
710 call dof_shared%push(id)
711 end do
712 else
713 do k = 2, ly - 1
714 id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, lz, i), max_id)
715 call local_dof%push(id)
716 id = linear_index(lx, k, lz, i, lx, ly, lz)
717 call dof_local%push(id)
718 end do
719 end if
720 !
721 ! dofs on edges in z-direction
722 !
723 if (dofmap%shared_dof(1, 1, 2, i)) then
724 do l = 2, lz - 1
725 id = gs_mapping_add_dof(sdm, dofmap%dof(1, 1, l, i), max_sid)
726 call shared_dof%push(id)
727 id = linear_index(1, 1, l, i, lx, ly, lz)
728 call dof_shared%push(id)
729 end do
730 else
731 do l = 2, lz - 1
732 id = gs_mapping_add_dof(dm, dofmap%dof(1, 1, l, i), max_id)
733 call local_dof%push(id)
734 id = linear_index(1, 1, l, i, lx, ly, lz)
735 call dof_local%push(id)
736 end do
737 end if
738
739 if (dofmap%shared_dof(lx, 1, 2, i)) then
740 do l = 2, lz - 1
741 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, 1, l, i), max_sid)
742 call shared_dof%push(id)
743 id = linear_index(lx, 1, l, i, lx, ly, lz)
744 call dof_shared%push(id)
745 end do
746 else
747 do l = 2, lz - 1
748 id = gs_mapping_add_dof(dm, dofmap%dof(lx, 1, l, i), max_id)
749 call local_dof%push(id)
750 id = linear_index(lx, 1, l, i, lx, ly, lz)
751 call dof_local%push(id)
752 end do
753 end if
754
755 if (dofmap%shared_dof(1, ly, 2, i)) then
756 do l = 2, lz - 1
757 id = gs_mapping_add_dof(sdm, dofmap%dof(1, ly, l, i), max_sid)
758 call shared_dof%push(id)
759 id = linear_index(1, ly, l, i, lx, ly, lz)
760 call dof_shared%push(id)
761 end do
762 else
763 do l = 2, lz - 1
764 id = gs_mapping_add_dof(dm, dofmap%dof(1, ly, l, i), max_id)
765 call local_dof%push(id)
766 id = linear_index(1, ly, l, i, lx, ly, lz)
767 call dof_local%push(id)
768 end do
769 end if
770
771 if (dofmap%shared_dof(lx, ly, 2, i)) then
772 do l = 2, lz - 1
773 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, ly, l, i), max_sid)
774 call shared_dof%push(id)
775 id = linear_index(lx, ly, l, i, lx, ly, lz)
776 call dof_shared%push(id)
777 end do
778 else
779 do l = 2, lz - 1
780 id = gs_mapping_add_dof(dm, dofmap%dof(lx, ly, l, i), max_id)
781 call local_dof%push(id)
782 id = linear_index(lx, ly, l, i, lx, ly, lz)
783 call dof_local%push(id)
784 end do
785 end if
786 end do
787 end if
788
789 ! Clear local dofmap table
790 call dm%clear()
791
792 !
793 ! Setup mapping for dofs on facets
794 !
795 ! This is for 2d
796 if (lz .eq. 1) then
797 do i = 1, msh%nelv
798
799 !
800 ! dofs on edges in x-direction
801 !
802 if (msh%facet_neigh(3, i) .ne. 0) then
803 if (dofmap%shared_dof(2, 1, 1, i)) then
804 do j = 2, lx - 1
805 id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, 1, i), max_sid)
806 call shared_face_dof%push(id)
807 id = linear_index(j, 1, 1, i, lx, ly, lz)
808 call face_dof_shared%push(id)
809 end do
810 else
811 do j = 2, lx - 1
812 id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, 1, i), max_id)
813 call local_face_dof%push(id)
814 id = linear_index(j, 1, 1, i, lx, ly, lz)
815 call face_dof_local%push(id)
816 end do
817 end if
818 end if
819
820 if (msh%facet_neigh(4, i) .ne. 0) then
821 if (dofmap%shared_dof(2, ly, 1, i)) then
822 do j = 2, lx - 1
823 id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, 1, i), &
824 max_sid)
825 call shared_face_dof%push(id)
826 id = linear_index(j, ly, 1, i, lx, ly, lz)
827 call face_dof_shared%push(id)
828 end do
829
830 else
831 do j = 2, lx - 1
832 id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, 1, i), &
833 max_id)
834 call local_face_dof%push(id)
835 id = linear_index(j, ly, 1, i, lx, ly, lz)
836 call face_dof_local%push(id)
837 end do
838 end if
839 end if
840
841 !
842 ! dofs on edges in y-direction
843 !
844 if (msh%facet_neigh(1, i) .ne. 0) then
845 if (dofmap%shared_dof(1, 2, 1, i)) then
846 do k = 2, ly - 1
847 id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, 1, i), max_sid)
848 call shared_face_dof%push(id)
849 id = linear_index(1, k, 1, i, lx, ly, lz)
850 call face_dof_shared%push(id)
851 end do
852 else
853 do k = 2, ly - 1
854 id = gs_mapping_add_dof(dm, dofmap%dof(1, k, 1, i), max_id)
855 call local_face_dof%push(id)
856 id = linear_index(1, k, 1, i, lx, ly, lz)
857 call face_dof_local%push(id)
858 end do
859 end if
860 end if
861
862 if (msh%facet_neigh(2, i) .ne. 0) then
863 if (dofmap%shared_dof(lx, 2, 1, i)) then
864 do k = 2, ly - 1
865 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, 1, i), &
866 max_sid)
867 call shared_face_dof%push(id)
868 id = linear_index(lx, k, 1, i, lx, ly, lz)
869 call face_dof_shared%push(id)
870 end do
871 else
872 do k = 2, ly - 1
873 id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, 1, i), &
874 max_id)
875 call local_face_dof%push(id)
876 id = linear_index(lx, k, 1, i, lx, ly, lz)
877 call face_dof_local%push(id)
878 end do
879 end if
880 end if
881 end do
882 else
883 do i = 1, msh%nelv
884
885 ! Facets in x-direction (s, t)-plane
886 if (msh%facet_neigh(1, i) .ne. 0) then
887 if (dofmap%shared_dof(1, 2, 2, i)) then
888 do l = 2, lz - 1
889 do k = 2, ly - 1
890 id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, l, i), &
891 max_sid)
892 call shared_face_dof%push(id)
893 id = linear_index(1, k, l, i, lx, ly, lz)
894 call face_dof_shared%push(id)
895 end do
896 end do
897 else
898 do l = 2, lz - 1
899 do k = 2, ly - 1
900 id = gs_mapping_add_dof(dm, dofmap%dof(1, k, l, i), &
901 max_id)
902 call local_face_dof%push(id)
903 id = linear_index(1, k, l, i, lx, ly, lz)
904 call face_dof_local%push(id)
905 end do
906 end do
907 end if
908 end if
909
910 if (msh%facet_neigh(2, i) .ne. 0) then
911 if (dofmap%shared_dof(lx, 2, 2, i)) then
912 do l = 2, lz - 1
913 do k = 2, ly - 1
914 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, l, i), &
915 max_sid)
916 call shared_face_dof%push(id)
917 id = linear_index(lx, k, l, i, lx, ly, lz)
918 call face_dof_shared%push(id)
919 end do
920 end do
921 else
922 do l = 2, lz - 1
923 do k = 2, ly - 1
924 id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, l, i), &
925 max_id)
926 call local_face_dof%push(id)
927 id = linear_index(lx, k, l, i, lx, ly, lz)
928 call face_dof_local%push(id)
929 end do
930 end do
931 end if
932 end if
933
934 ! Facets in y-direction (r, t)-plane
935 if (msh%facet_neigh(3, i) .ne. 0) then
936 if (dofmap%shared_dof(2, 1, 2, i)) then
937 do l = 2, lz - 1
938 do j = 2, lx - 1
939 id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, l, i), &
940 max_sid)
941 call shared_face_dof%push(id)
942 id = linear_index(j, 1, l, i, lx, ly, lz)
943 call face_dof_shared%push(id)
944 end do
945 end do
946 else
947 do l = 2, lz - 1
948 do j = 2, lx - 1
949 id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, l, i), &
950 max_id)
951 call local_face_dof%push(id)
952 id = linear_index(j, 1, l, i, lx, ly, lz)
953 call face_dof_local%push(id)
954 end do
955 end do
956 end if
957 end if
958
959 if (msh%facet_neigh(4, i) .ne. 0) then
960 if (dofmap%shared_dof(2, ly, 2, i)) then
961 do l = 2, lz - 1
962 do j = 2, lx - 1
963 id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, l, i), &
964 max_sid)
965 call shared_face_dof%push(id)
966 id = linear_index(j, ly, l, i, lx, ly, lz)
967 call face_dof_shared%push(id)
968 end do
969 end do
970 else
971 do l = 2, lz - 1
972 do j = 2, lx - 1
973 id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, l, i), &
974 max_id)
975 call local_face_dof%push(id)
976 id = linear_index(j, ly, l, i, lx, ly, lz)
977 call face_dof_local%push(id)
978 end do
979 end do
980 end if
981 end if
982
983 ! Facets in z-direction (r, s)-plane
984 if (msh%facet_neigh(5, i) .ne. 0) then
985 if (dofmap%shared_dof(2, 2, 1, i)) then
986 do k = 2, ly - 1
987 do j = 2, lx - 1
988 id = gs_mapping_add_dof(sdm, dofmap%dof(j, k, 1, i), &
989 max_sid)
990 call shared_face_dof%push(id)
991 id = linear_index(j, k, 1, i, lx, ly, lz)
992 call face_dof_shared%push(id)
993 end do
994 end do
995 else
996 do k = 2, ly - 1
997 do j = 2, lx - 1
998 id = gs_mapping_add_dof(dm, dofmap%dof(j, k, 1, i), &
999 max_id)
1000 call local_face_dof%push(id)
1001 id = linear_index(j, k, 1, i, lx, ly, lz)
1002 call face_dof_local%push(id)
1003 end do
1004 end do
1005 end if
1006 end if
1007
1008 if (msh%facet_neigh(6, i) .ne. 0) then
1009 if (dofmap%shared_dof(2, 2, lz, i)) then
1010 do k = 2, ly - 1
1011 do j = 2, lx - 1
1012 id = gs_mapping_add_dof(sdm, dofmap%dof(j, k, lz, i), &
1013 max_sid)
1014 call shared_face_dof%push(id)
1015 id = linear_index(j, k, lz, i, lx, ly, lz)
1016 call face_dof_shared%push(id)
1017 end do
1018 end do
1019 else
1020 do k = 2, ly - 1
1021 do j = 2, lx - 1
1022 id = gs_mapping_add_dof(dm, dofmap%dof(j, k, lz, i), &
1023 max_id)
1024 call local_face_dof%push(id)
1025 id = linear_index(j, k, lz, i, lx, ly, lz)
1026 call face_dof_local%push(id)
1027 end do
1028 end do
1029 end if
1030 end if
1031 end do
1032 end if
1033
1034
1035 call dm%free()
1036
1037 gs%nlocal = local_dof%size() + local_face_dof%size()
1038 gs%local_facet_offset = local_dof%size() + 1
1039
1040 ! Finalize local dof to gather-scatter index
1041 allocate(gs%local_dof_gs(gs%nlocal))
1042
1043 ! Add dofs on points and edges
1044
1045 ! We should use the %array() procedure, which works great for
1046 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1047 ! certain data types
1048 select type (dof_array => local_dof%data)
1049 type is (integer)
1050 j = local_dof%size()
1051 do i = 1, j
1052 gs%local_dof_gs(i) = dof_array(i)
1053 end do
1054 end select
1055 call local_dof%free()
1056
1057 ! Add dofs on faces
1058
1059 ! We should use the %array() procedure, which works great for
1060 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1061 ! certain data types
1062 select type (dof_array => local_face_dof%data)
1063 type is (integer)
1064 do i = 1, local_face_dof%size()
1065 gs%local_dof_gs(i + j) = dof_array(i)
1066 end do
1067 end select
1068 call local_face_dof%free()
1069
1070 ! Finalize local gather-scatter index to dof
1071 allocate(gs%local_gs_dof(gs%nlocal))
1072
1073 ! Add gather-scatter index on points and edges
1074
1075 ! We should use the %array() procedure, which works great for
1076 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1077 ! certain data types
1078 select type (dof_array => dof_local%data)
1079 type is (integer)
1080 j = dof_local%size()
1081 do i = 1, j
1082 gs%local_gs_dof(i) = dof_array(i)
1083 end do
1084 end select
1085 call dof_local%free()
1086
1087 ! We should use the %array() procedure, which works great for
1088 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1089 ! certain data types
1090 select type (dof_array => face_dof_local%data)
1091 type is (integer)
1092 do i = 1, face_dof_local%size()
1093 gs%local_gs_dof(i+j) = dof_array(i)
1094 end do
1095 end select
1096 call face_dof_local%free()
1097
1098 call gs_qsort_dofmap(gs%local_dof_gs, gs%local_gs_dof, &
1099 gs%nlocal, 1, gs%nlocal)
1100
1101 call gs_find_blks(gs%local_dof_gs, gs%local_blk_len, &
1102 gs%local_blk_off, gs%nlocal_blks, gs%nlocal, gs%local_facet_offset)
1103
1104 ! Allocate buffer for local gs-ops
1105 allocate(gs%local_gs(gs%nlocal))
1106
1107 gs%nshared = shared_dof%size() + shared_face_dof%size()
1108 gs%shared_facet_offset = shared_dof%size() + 1
1109
1110 ! Finalize shared dof to gather-scatter index
1111 allocate(gs%shared_dof_gs(gs%nshared))
1112
1113 ! Add shared dofs on points and edges
1114
1115 ! We should use the %array() procedure, which works great for
1116 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1117 ! certain data types
1118 select type (dof_array => shared_dof%data)
1119 type is (integer)
1120 j = shared_dof%size()
1121 do i = 1, j
1122 gs%shared_dof_gs(i) = dof_array(i)
1123 end do
1124 end select
1125 call shared_dof%free()
1126
1127 ! Add shared dofs on faces
1128
1129 ! We should use the %array() procedure, which works great for
1130 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1131 ! certain data types
1132 select type (dof_array => shared_face_dof%data)
1133 type is (integer)
1134 do i = 1, shared_face_dof%size()
1135 gs%shared_dof_gs(i + j) = dof_array(i)
1136 end do
1137 end select
1138 call shared_face_dof%free()
1139
1140 ! Finalize shared gather-scatter index to dof
1141 allocate(gs%shared_gs_dof(gs%nshared))
1142
1143 ! Add dofs on points and edges
1144
1145 ! We should use the %array() procedure, which works great for
1146 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1147 ! certain data types
1148 select type (dof_array => dof_shared%data)
1149 type is (integer)
1150 j = dof_shared%size()
1151 do i = 1, j
1152 gs%shared_gs_dof(i) = dof_array(i)
1153 end do
1154 end select
1155 call dof_shared%free()
1156
1157 ! We should use the %array() procedure, which works great for
1158 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1159 ! certain data types
1160 select type (dof_array => face_dof_shared%data)
1161 type is (integer)
1162 do i = 1, face_dof_shared%size()
1163 gs%shared_gs_dof(i + j) = dof_array(i)
1164 end do
1165 end select
1166 call face_dof_shared%free()
1167
1168 ! Allocate buffer for shared gs-ops
1169 allocate(gs%shared_gs(gs%nshared))
1170
1171 if (gs%nshared .gt. 0) then
1172 call gs_qsort_dofmap(gs%shared_dof_gs, gs%shared_gs_dof, &
1173 gs%nshared, 1, gs%nshared)
1174
1175 call gs_find_blks(gs%shared_dof_gs, gs%shared_blk_len, &
1176 gs%shared_blk_off, gs%nshared_blks, gs%nshared, &
1177 gs%shared_facet_offset)
1178 end if
1179
1180 contains
1181
1190 function gs_mapping_add_dof(map_, dof, max_id) result(id)
1191 type(htable_i8_t), intent(inout) :: map_
1192 integer(kind=i8), intent(inout) :: dof
1193 integer, intent(inout) :: max_id
1194 integer :: id
1195
1196 if (map_%get(dof, id) .gt. 0) then
1197 max_id = max_id + 1
1198 call map_%set(dof, max_id)
1199 id = max_id
1200 end if
1201
1202 end function gs_mapping_add_dof
1203
1205 recursive subroutine gs_qsort_dofmap(dg, gd, n, lo, hi)
1206 integer, intent(inout) :: n
1207 integer, dimension(n), intent(inout) :: dg
1208 integer, dimension(n), intent(inout) :: gd
1209 integer :: lo, hi
1210 integer :: tmp, i, j, pivot
1211
1212 i = lo - 1
1213 j = hi + 1
1214 pivot = dg((lo + hi) / 2)
1215 do
1216 do
1217 i = i + 1
1218 if (dg(i) .ge. pivot) exit
1219 end do
1220
1221 do
1222 j = j - 1
1223 if (dg(j) .le. pivot) exit
1224 end do
1225
1226 if (i .lt. j) then
1227 tmp = dg(i)
1228 dg(i) = dg(j)
1229 dg(j) = tmp
1230
1231 tmp = gd(i)
1232 gd(i) = gd(j)
1233 gd(j) = tmp
1234 else if (i .eq. j) then
1235 i = i + 1
1236 exit
1237 else
1238 exit
1239 end if
1240 end do
1241 if (lo .lt. j) call gs_qsort_dofmap(dg, gd, n, lo, j)
1242 if (i .lt. hi) call gs_qsort_dofmap(dg, gd, n, i, hi)
1243
1244 end subroutine gs_qsort_dofmap
1245
1247 subroutine gs_find_blks(dg, blk_len, blk_off, nblks, n, m)
1248 integer, intent(in) :: n
1249 integer, intent(in) :: m
1250 integer, dimension(n), intent(inout) :: dg
1251 integer, allocatable, intent(inout) :: blk_len(:)
1252 integer, allocatable, intent(inout) :: blk_off(:)
1253 integer, intent(inout) :: nblks
1254 integer :: i, j
1255 integer :: id, count
1256 type(stack_i4_t), target :: blks
1257
1258 call blks%init()
1259 i = 1
1260 do while (i .lt. m)
1261 id = dg(i)
1262 count = 1
1263 j = i
1264 do while ( j+1 .le. n .and. dg(j+1) .eq. id)
1265 j = j + 1
1266 count = count + 1
1267 end do
1268 call blks%push(count)
1269 i = j + 1
1270 end do
1271
1272 select type (blk_array => blks%data)
1273 type is (integer)
1274 nblks = blks%size()
1275 allocate(blk_len(nblks))
1276 do i = 1, nblks
1277 blk_len(i) = blk_array(i)
1278 end do
1279 allocate(blk_off(nblks))
1280 blk_off(1) = 0
1281 do i = 2, nblks
1282 blk_off(i) = blk_off(i - 1) + blk_len(i - 1)
1283 end do
1284 end select
1285 call blks%free()
1286
1287 end subroutine gs_find_blks
1288
1289 end subroutine gs_init_mapping
1290
1292 subroutine gs_schedule(gs)
1293 type(gs_t), target, intent(inout) :: gs
1294 integer(kind=i8), allocatable :: send_buf(:), recv_buf(:)
1295 integer(kind=i2), allocatable :: shared_flg(:), recv_flg(:)
1296 type(htable_iter_i8_t) :: it
1297 type(stack_i4_t) :: send_pe, recv_pe
1298 type(mpi_status) :: status
1299 type(mpi_request) :: send_req, recv_req
1300 integer :: i, j, max_recv, src, dst, ierr, n_recv
1301 integer :: tmp, shared_gs_id
1302 integer :: nshared_unique
1303
1304 nshared_unique = gs%shared_dofs%num_entries()
1305
1306 call it%init(gs%shared_dofs)
1307 allocate(send_buf(nshared_unique))
1308 i = 1
1309 do while (it%next())
1310 send_buf(i) = it%key()
1311 i = i + 1
1312 end do
1313
1314 call send_pe%init()
1315 call recv_pe%init()
1316
1317
1318 !
1319 ! Schedule exchange of shared dofs
1320 !
1321
1322 call mpi_allreduce(nshared_unique, max_recv, 1, &
1323 mpi_integer, mpi_max, neko_comm, ierr)
1324
1325 allocate(recv_buf(max_recv))
1326 allocate(shared_flg(max_recv))
1327 allocate(recv_flg(max_recv))
1328
1330 do i = 1, size(gs%dofmap%msh%neigh_order)
1331 src = modulo(pe_rank - gs%dofmap%msh%neigh_order(i) + pe_size, pe_size)
1332 dst = modulo(pe_rank + gs%dofmap%msh%neigh_order(i), pe_size)
1333
1334 if (gs%dofmap%msh%neigh(src)) then
1335 call mpi_irecv(recv_buf, max_recv, mpi_integer8, &
1336 src, 0, neko_comm, recv_req, ierr)
1337 end if
1338
1339 if (gs%dofmap%msh%neigh(dst)) then
1340 call mpi_isend(send_buf, nshared_unique, mpi_integer8, &
1341 dst, 0, neko_comm, send_req, ierr)
1342 end if
1343
1344 if (gs%dofmap%msh%neigh(src)) then
1345 call mpi_wait(recv_req, status, ierr)
1346 call mpi_get_count(status, mpi_integer8, n_recv, ierr)
1347
1348 do j = 1, n_recv
1349 shared_flg(j) = gs%shared_dofs%get(recv_buf(j), shared_gs_id)
1350 if (shared_flg(j) .eq. 0) then
1352 call gs%comm%recv_dof(src)%push(shared_gs_id)
1353 end if
1354 end do
1355
1356 if (gs%comm%recv_dof(src)%size() .gt. 0) then
1357 call recv_pe%push(src)
1358 end if
1359 end if
1360
1361 if (gs%dofmap%msh%neigh(dst)) then
1362 call mpi_wait(send_req, mpi_status_ignore, ierr)
1363 call mpi_irecv(recv_flg, max_recv, mpi_integer2, &
1364 dst, 0, neko_comm, recv_req, ierr)
1365 end if
1366
1367 if (gs%dofmap%msh%neigh(src)) then
1368 call mpi_isend(shared_flg, n_recv, mpi_integer2, &
1369 src, 0, neko_comm, send_req, ierr)
1370 end if
1371
1372 if (gs%dofmap%msh%neigh(dst)) then
1373 call mpi_wait(recv_req, status, ierr)
1374 call mpi_get_count(status, mpi_integer2, n_recv, ierr)
1375
1376 do j = 1, n_recv
1377 if (recv_flg(j) .eq. 0) then
1378 tmp = gs%shared_dofs%get(send_buf(j), shared_gs_id)
1380 call gs%comm%send_dof(dst)%push(shared_gs_id)
1381 end if
1382 end do
1383
1384 if (gs%comm%send_dof(dst)%size() .gt. 0) then
1385 call send_pe%push(dst)
1386 end if
1387 end if
1388
1389 if (gs%dofmap%msh%neigh(src)) then
1390 call mpi_wait(send_req, mpi_status_ignore, ierr)
1391 end if
1392
1393 end do
1394
1395 call gs%comm%init(send_pe, recv_pe)
1396
1397 call send_pe%free()
1398 call recv_pe%free()
1399
1400 deallocate(send_buf)
1401 deallocate(recv_flg)
1402 deallocate(shared_flg)
1403 !This arrays seems to take massive amounts of memory...
1404 call gs%shared_dofs%free()
1405
1406 end subroutine gs_schedule
1407
1409 subroutine gs_op_fld(gs, u, op, event)
1410 class(gs_t), intent(inout) :: gs
1411 type(field_t), intent(inout) :: u
1412 type(c_ptr), optional, intent(inout) :: event
1413 integer :: n, op
1414
1415 n = u%msh%nelv * u%Xh%lx * u%Xh%ly * u%Xh%lz
1416 if (present(event)) then
1417 call gs_op_vector(gs, u%x, n, op, event)
1418 else
1419 call gs_op_vector(gs, u%x, n, op)
1420 end if
1421
1422 end subroutine gs_op_fld
1423
1425 subroutine gs_op_r4(gs, u, n, op, event)
1426 class(gs_t), intent(inout) :: gs
1427 integer, intent(in) :: n
1428 real(kind=rp), contiguous, dimension(:,:,:,:), intent(inout) :: u
1429 type(c_ptr), optional, intent(inout) :: event
1430 integer :: op
1431
1432 if (present(event)) then
1433 call gs_op_vector(gs, u, n, op, event)
1434 else
1435 call gs_op_vector(gs, u, n, op)
1436 end if
1437
1438 end subroutine gs_op_r4
1439
1441 subroutine gs_op_vector(gs, u, n, op, event)
1442 class(gs_t), intent(inout) :: gs
1443 integer, intent(in) :: n
1444 real(kind=rp), dimension(n), intent(inout) :: u
1445 type(c_ptr), optional, intent(inout) :: event
1446 integer :: m, l, op, lo, so, tid
1447 type(c_ptr) :: scatter_event
1448
1449 lo = gs%local_facet_offset
1450 so = -gs%shared_facet_offset
1451 m = gs%nlocal
1452 l = gs%nshared
1453
1454 ! Capture the calling thread id before opening any parallel region; it
1455 ! is used as the MPI tag so concurrent gs ops driven from different
1456 ! threads (device path) don't collide.
1457 tid = 0
1458 !$ tid = omp_get_thread_num()
1459
1460 ! Resolve the optional event into a non-optional local before opening
1461 ! the parallel region. An absent optional dummy must not be captured by
1462 ! the region's data-sharing, otherwise the outlined region prologue
1463 ! dereferences a null descriptor (segfaults on CCE).
1464 scatter_event = c_null_ptr
1465 if (present(event)) scatter_event = event
1466
1467 !$omp parallel if (NEKO_BCKND_DEVICE .eq. 0)
1468 call profiler_start_region("gather_scatter", 5)
1469 ! Gather shared dofs
1470 if (pe_size .gt. 1 .and. n .gt. 0) then
1471 call profiler_start_region("gs_nbrecv", 13)
1472 call gs%comm%nbrecv(tid)
1473 call profiler_end_region("gs_nbrecv", 13)
1474 call profiler_start_region("gs_gather_shared", 14)
1475 call gs%bcknd%gather(gs%shared_gs, l, so, gs%shared_dof_gs, u, n, &
1476 gs%shared_gs_dof, gs%nshared_blks, gs%shared_blk_len, &
1477 gs%shared_blk_off, op, .true.)
1478 call profiler_end_region("gs_gather_shared", 14)
1479 call profiler_start_region("gs_nbsend", 6)
1480 call gs%comm%nbsend(gs%shared_gs, l, tid, &
1481 gs%bcknd%gather_event, gs%bcknd%gs_stream)
1482 call profiler_end_region("gs_nbsend", 6)
1483
1484 end if
1485
1486 ! Gather-scatter local dofs
1487 call profiler_start_region("gs_local", 12)
1488 call gs%bcknd%gather(gs%local_gs, m, lo, gs%local_dof_gs, u, n, &
1489 gs%local_gs_dof, gs%nlocal_blks, gs%local_blk_len, gs%local_blk_off, &
1490 op, .false.)
1491 call gs%bcknd%scatter(gs%local_gs, m, gs%local_dof_gs, u, n, &
1492 gs%local_gs_dof, gs%nlocal_blks, gs%local_blk_len, gs%local_blk_off, &
1493 .false., c_null_ptr)
1494 call profiler_end_region("gs_local", 12)
1495 ! Scatter shared dofs
1496 if (pe_size .gt. 1 .and. n .gt. 0) then
1497 call profiler_start_region("gs_nbwait", 7)
1498 call gs%comm%nbwait(gs%shared_gs, l, op, gs%bcknd%gs_stream)
1499 call profiler_end_region("gs_nbwait", 7)
1500 call profiler_start_region("gs_scatter_shared", 15)
1501 call gs%bcknd%scatter(gs%shared_gs, l,&
1502 gs%shared_dof_gs, u, n, &
1503 gs%shared_gs_dof, gs%nshared_blks, &
1504 gs%shared_blk_len, gs%shared_blk_off, .true., scatter_event)
1505 call profiler_end_region("gs_scatter_shared", 15)
1506 end if
1507
1508 call profiler_end_region("gather_scatter", 5)
1509 !$omp end parallel
1510 end subroutine gs_op_vector
1511
1512end module gather_scatter
recursive subroutine gs_qsort_dofmap(dg, gd, n, lo, hi)
Sort the dof lists based on the dof to gather-scatter list.
subroutine gs_find_blks(dg, blk_len, blk_off, nblks, n, m)
Find blocks sharing dofs in non-facet data.
integer function gs_mapping_add_dof(map_, dof, max_id)
Register a unique dof Takes the unique id dof and checks if it is in the htable map_ If it is we retu...
Map a Fortran array to a device (allocate and associate)
Definition device.F90:78
Copy data between host and device (or device and device)
Definition device.F90:72
Synchronize a device or stream.
Definition device.F90:114
Unmap a Fortran array from a device (deassociate and free)
Definition device.F90:84
Definition comm.F90:1
integer, public pe_size
MPI size of communicator.
Definition comm.F90:61
integer, public pe_rank
MPI rank.
Definition comm.F90:58
type(mpi_comm), public neko_comm
MPI communicator.
Definition comm.F90:45
Device abstraction, common interface for various accelerators.
Definition device.F90:34
integer, parameter, public host_to_device
Definition device.F90:48
Defines a mapping of the degrees of freedom.
Definition dofmap.f90:35
Defines a field.
Definition field.f90:34
Gather-scatter.
subroutine gs_init_mapping(gs)
Setup mapping of dofs to gather-scatter operations.
subroutine gs_schedule(gs)
Schedule shared gather-scatter operations.
subroutine gs_free(gs)
Deallocate a gather-scatter kernel.
subroutine gs_op_r4(gs, u, n, op, event)
Gather-scatter operation on a rank 4 array.
subroutine gs_init(gs, dofmap, bcknd, comm_bcknd)
Initialize a gather-scatter kernel.
subroutine gs_op_vector(gs, u, n, op, event)
Gather-scatter operation on a vector u with op op.
subroutine gs_op_fld(gs, u, op, event)
Gather-scatter operation on a field u with op op.
Defines a gather-scatter backend.
Definition gs_bcknd.f90:34
integer, parameter, public gs_bcknd_cpu
Definition gs_bcknd.f90:40
integer, parameter, public gs_bcknd_sx
Definition gs_bcknd.f90:40
integer, parameter, public gs_bcknd_dev
Definition gs_bcknd.f90:40
Defines Coarray Fortran gather-scatter communication.
Definition gs_caf.F90:34
Defines a gather-scatter communication method.
Definition gs_comm.f90:34
integer, parameter, public gs_comm_mpigpu
Definition gs_comm.f90:42
integer, parameter, public gs_comm_mpi
Definition gs_comm.f90:42
integer, parameter, public gs_comm_nvshmem
Definition gs_comm.f90:42
integer, parameter, public gs_comm_openshmem
Definition gs_comm.f90:42
integer, parameter, public gs_comm_nccl
Definition gs_comm.f90:42
integer, parameter, public gs_comm_caf
Definition gs_comm.f90:42
Generic Gather-scatter backend for CPUs.
Definition gs_cpu.f90:34
Defines GPU aware MPI gather-scatter communication.
Defines NCCL based gather-scatter communication.
Defines GPU aware MPI gather-scatter communication.
Generic Gather-scatter backend for accelerators.
Definition gs_device.F90:34
Defines MPI gather-scatter communication.
Definition gs_mpi.f90:34
Defines Gather-scatter operations.
Definition gs_ops.f90:34
integer, parameter, public gs_op_add
Definition gs_ops.f90:36
integer, parameter, public gs_op_max
Definition gs_ops.f90:36
integer, parameter, public gs_op_min
Definition gs_ops.f90:36
integer, parameter, public gs_op_mul
Definition gs_ops.f90:36
Defines OpenSHMEM gather-scatter communication.
Definition gs_shmem.F90:34
Generic Gather-scatter backend for NEC Vector Engines.
Definition gs_sx.f90:34
Implements a hash table ADT.
Definition htable.f90:52
Logging routines.
Definition log.f90:34
type(log_t), public neko_log
Global log stream.
Definition log.f90:80
integer, parameter, public log_size
Definition log.f90:46
Defines a mesh.
Definition mesh.f90:34
Build configurations.
integer, parameter neko_bcknd_sx
integer, parameter neko_bcknd_hip
integer, parameter neko_bcknd_device
integer, parameter neko_bcknd_opencl
logical, parameter neko_device_mpi
integer, parameter neko_bcknd_cuda
integer, parameter neko_bcknd_metal
integer, parameter, public i2
Definition num_types.f90:5
integer, parameter, public i8
Definition num_types.f90:7
integer, parameter, public dp
Definition num_types.f90:9
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Profiling interface.
Definition profiler.F90:34
subroutine, public profiler_start_region(name, region_id)
Started a named (name) profiler region.
Definition profiler.F90:79
subroutine, public profiler_end_region(name, region_id)
End the most recently started profiler region.
Definition profiler.F90:116
Implements a dynamic stack ADT.
Definition stack.f90:49
Utilities.
Definition utils.f90:35
pure integer function, public linear_index(i, j, k, l, lx, ly, lz)
Compute the address of a (i,j,k,l) array with sizes (1:lx, 1:ly, 1:lz, :)
Definition utils.f90:289
Gather-scatter backend.
Definition gs_bcknd.f90:44
Gather-scatter communication using Coarray Fortran (F2008). Each image puts directly into the (module...
Definition gs_caf.F90:129
Gather-scatter communication method.
Definition gs_comm.f90:47
Gather-scatter backend for CPUs.
Definition gs_cpu.f90:43
Gather-scatter backend for offloading devices.
Definition gs_device.F90:48
Gather-scatter communication using device MPI. The arrays are indexed per PE like send_pe and @ recv_...
Gather-scatter communication using NCCL The arrays are indexed per PE like send_pe and @ recv_pe.
Gather-scatter communication using device SHMEM. The arrays are indexed per PE like send_pe and @ rec...
Gather-scatter communication using MPI.
Definition gs_mpi.f90:49
Gather-scatter communication using OpenSHMEM one-sided puts with per-rank signaling for completion (O...
Definition gs_shmem.F90:91
Gather-scatter backend for NEC SX-Aurora.
Definition gs_sx.f90:43
Integer*8 based hash table.
Definition htable.f90:112
Iterator for an integer*8 based hash table.
Definition htable.f90:195
Integer based stack.
Definition stack.f90:77