Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
gather_scatter.f90
Go to the documentation of this file.
1! Copyright (c) 2020-2025, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
38 use gs_device, only : gs_device_t
39 use gs_sx, only : gs_sx_t
40 use gs_cpu, only : gs_cpu_t
44 use gs_mpi, only : gs_mpi_t
48 use mesh, only : mesh_t
49 use comm, only : pe_rank, pe_size, neko_comm
50 use mpi_f08, only : mpi_reduce, mpi_allreduce, mpi_barrier, mpi_in_place, &
51 mpi_wait, mpi_irecv, mpi_isend, mpi_wtime, mpi_sum, mpi_max, &
52 mpi_integer, mpi_integer2, mpi_integer8, mpi_request, mpi_status, &
53 mpi_status_ignore
54 use dofmap, only : dofmap_t
55 use field, only : field_t
56 use num_types, only : rp, dp, i2, i8
58 use stack, only : stack_i4_t
59 use utils, only : neko_error, linear_index
60 use logger, only : neko_log, log_size
64 use, intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr
65 implicit none
66 private
67
68 type, public :: gs_t
69 real(kind=rp), allocatable :: local_gs(:)
70 integer, allocatable :: local_dof_gs(:)
71 integer, allocatable :: local_gs_dof(:)
72 integer, allocatable :: local_blk_len(:)
73 real(kind=rp), allocatable :: shared_gs(:)
74 integer, allocatable :: shared_dof_gs(:)
75 integer, allocatable :: shared_gs_dof(:)
76 integer, allocatable :: shared_blk_len(:)
77 type(dofmap_t), pointer ::dofmap
78 type(htable_i8_t) :: shared_dofs
79 integer :: nlocal
80 integer :: nshared
81 integer :: nlocal_blks
82 integer :: nshared_blks
83 integer :: local_facet_offset
84 integer :: shared_facet_offset
85 class(gs_bcknd_t), allocatable :: bcknd
86 class(gs_comm_t), allocatable :: comm
87 contains
88 procedure, private, pass(gs) :: gs_op_fld
89 procedure, private, pass(gs) :: gs_op_r4
90 procedure, pass(gs) :: gs_op_vector
91 procedure, pass(gs) :: init => gs_init
92 procedure, pass(gs) :: free => gs_free
93 generic :: op => gs_op_fld, gs_op_r4, gs_op_vector
94 end type gs_t
95
96 ! Expose available gather-scatter operation
98
99 ! Expose available gather-scatter backends
101
102 ! Expose available gather-scatter comm. backends
104
105
106contains
107
112 subroutine gs_init(gs, dofmap, bcknd, comm_bcknd)
113 class(gs_t), intent(inout) :: gs
114 type(dofmap_t), target, intent(inout) :: dofmap
115 character(len=LOG_SIZE) :: log_buf
116 character(len=20) :: bcknd_str
117 integer, optional :: bcknd, comm_bcknd
118 integer :: i, j, ierr, bcknd_, comm_bcknd_
119 integer(i8) :: glb_nshared, glb_nlocal
120 logical :: use_device_mpi, use_device_nccl, use_device_shmem, use_host_mpi
121 real(kind=rp), allocatable :: tmp(:)
122 type(c_ptr) :: tmp_d = c_null_ptr
123 integer :: strtgy(4) = (/ int(b'00'), int(b'01'), int(b'10'), int(b'11') /)
124 integer :: avg_strtgy, env_len
125 character(len=255) :: env_strtgy, env_gscomm
126 real(kind=dp) :: strtgy_time(4)
127
128 call gs%free()
129
130 call neko_log%section('Gather-Scatter')
131 ! Currently this uses the dofmap which also contains geometric information
132 ! Only connectivity/numbering of points is technically necessary for gs
133 gs%dofmap => dofmap
134
135 use_device_mpi = .false.
136 use_device_nccl = .false.
137 use_device_shmem = .false.
138 use_host_mpi = .false.
139 ! Check if a comm-backend is requested via env. variables
140 call get_environment_variable("NEKO_GS_COMM", env_gscomm, env_len)
141 if (env_len .gt. 0) then
142 if (env_gscomm(1:env_len) .eq. "MPI") then
143 use_host_mpi = .true.
144 else if (env_gscomm(1:env_len) .eq. "MPIGPU") then
145 use_device_mpi = .true.
146 else if (env_gscomm(1:env_len) .eq. "NCCL") then
147 use_device_nccl = .true.
148 else if (env_gscomm(1:env_len) .eq. "SHMEM") then
149 use_device_shmem = .true.
150 else
151 call neko_error('Unknown Gather-scatter comm. backend')
152 end if
153 end if
154
155
156 if (present(comm_bcknd)) then
157 comm_bcknd_ = comm_bcknd
158 else if (use_host_mpi) then
159 comm_bcknd_ = gs_comm_mpi
160 else if (use_device_mpi) then
161 comm_bcknd_ = gs_comm_mpigpu
162 else if (use_device_nccl) then
163 comm_bcknd_ = gs_comm_nccl
164 else if (use_device_shmem) then
165 comm_bcknd_ = gs_comm_nvshmem
166 else
167 if (neko_device_mpi) then
168 comm_bcknd_ = gs_comm_mpigpu
169 use_device_mpi = .true.
170 else
171 comm_bcknd_ = gs_comm_mpi
172 end if
173 end if
174
175 select case (comm_bcknd_)
176 case (gs_comm_mpi)
177 call neko_log%message('Comm : MPI')
178 allocate(gs_mpi_t::gs%comm)
179 case (gs_comm_mpigpu)
180 call neko_log%message('Comm : Device MPI')
181 allocate(gs_device_mpi_t::gs%comm)
182 case (gs_comm_nccl)
183 call neko_log%message('Comm : NCCL')
184 allocate(gs_device_nccl_t::gs%comm)
185 case (gs_comm_nvshmem)
186 call neko_log%message('Comm : NVSHMEM')
187 allocate(gs_device_shmem_t::gs%comm)
188 case default
189 call neko_error('Unknown Gather-scatter comm. backend')
190 end select
191 ! Initialize a stack for each rank containing which dofs to send/recv at
192 ! that rank
193 call gs%comm%init_dofs()
194 ! Initialize mapping between local ids and gather-scatter ids
195 ! based on the global numbering in dofmap
196 call gs_init_mapping(gs)
197 ! Setup buffers and which ranks to send/recv data from based on mapping
198 ! and initializes gs%comm (sets up gs%comm%send_dof and gs%comm%recv_dof and
199 ! recv_pe/send_pe)
200 call gs_schedule(gs)
201 ! Global number of points not needing to be sent over mpi for gs operations
202 ! "Internal points"
203 glb_nlocal = int(gs%nlocal, i8)
204 ! Global number of points needing to be communicated with other pes/ranks
205 ! "external points"
206 glb_nshared = int(gs%nshared, i8)
207 ! Can be thought of a measure of the volume of this rank (glb_nlocal) and
208 ! the surface area (glb_nshared) that is shared with other ranks
209 ! Lots of internal volume compared to surface that needs communication is
210 ! good
211
212 if (pe_rank .eq. 0) then
213 call mpi_reduce(mpi_in_place, glb_nlocal, 1, &
214 mpi_integer8, mpi_sum, 0, neko_comm, ierr)
215
216 call mpi_reduce(mpi_in_place, glb_nshared, 1, &
217 mpi_integer8, mpi_sum, 0, neko_comm, ierr)
218 else
219 call mpi_reduce(glb_nlocal, glb_nlocal, 1, &
220 mpi_integer8, mpi_sum, 0, neko_comm, ierr)
221
222 call mpi_reduce(glb_nshared, glb_nshared, 1, &
223 mpi_integer8, mpi_sum, 0, neko_comm, ierr)
224 end if
225
226 write(log_buf, '(A,I12)') 'Avg. internal: ', glb_nlocal/pe_size
227 call neko_log%message(log_buf)
228 write(log_buf, '(A,I12)') 'Avg. external: ', glb_nshared/pe_size
229 call neko_log%message(log_buf)
230
231 if (present(bcknd)) then
232 bcknd_ = bcknd
233 else
234 if (neko_bcknd_sx .eq. 1) then
235 bcknd_ = gs_bcknd_sx
236 else if (neko_bcknd_device .eq. 1) then
237 bcknd_ = gs_bcknd_dev
238 else
239 bcknd_ = gs_bcknd_cpu
240 end if
241 end if
242
243 ! Setup Gather-scatter backend
244 select case(bcknd_)
245 case(gs_bcknd_cpu)
246 allocate(gs_cpu_t::gs%bcknd)
247 bcknd_str = ' std'
248 case(gs_bcknd_dev)
249 allocate(gs_device_t::gs%bcknd)
250 if (neko_bcknd_hip .eq. 1) then
251 bcknd_str = ' hip'
252 else if (neko_bcknd_cuda .eq. 1) then
253 bcknd_str = ' cuda'
254 else if (neko_bcknd_opencl .eq. 1) then
255 bcknd_str = ' opencl'
256 end if
257 case(gs_bcknd_sx)
258 allocate(gs_sx_t::gs%bcknd)
259 bcknd_str = ' sx'
260 case default
261 call neko_error('Unknown Gather-scatter backend')
262 end select
263
264 write(log_buf, '(A)') 'Backend : ' // trim(bcknd_str)
265 call neko_log%message(log_buf)
266
267
268
269 call gs%bcknd%init(gs%nlocal, gs%nshared, gs%nlocal_blks, gs%nshared_blks)
270
271 if (use_device_mpi .or. use_device_nccl .or. use_device_shmem) then
272 select type(b => gs%bcknd)
273 type is (gs_device_t)
274 b%shared_on_host = .false.
275 end select
276 end if
277
278 if (use_device_mpi) then
279 if(pe_size .gt. 1) then
280 ! Select fastest device MPI strategy at runtime
281 select type(c => gs%comm)
282 type is (gs_device_mpi_t)
283 call get_environment_variable("NEKO_GS_STRTGY", env_strtgy, env_len)
284 if (env_len .eq. 0) then
285 allocate(tmp(dofmap%size()))
286 call device_map(tmp, tmp_d, dofmap%size())
287 tmp = 1.0_rp
288 call device_memcpy(tmp, tmp_d, dofmap%size(), &
289 host_to_device, sync=.false.)
290 call gs_op_vector(gs, tmp, dofmap%size(), gs_op_add)
291
292 do i = 1, size(strtgy)
293 c%nb_strtgy = strtgy(i)
294 call device_sync
295 call mpi_barrier(neko_comm)
296 strtgy_time(i) = mpi_wtime()
297 do j = 1, 100
298 call gs_op_vector(gs, tmp, dofmap%size(), gs_op_add)
299 end do
300 strtgy_time(i) = (mpi_wtime() - strtgy_time(i)) / 100d0
301 end do
302
303 call device_deassociate(tmp)
304 call device_free(tmp_d)
305 deallocate(tmp)
306
307 c%nb_strtgy = strtgy(minloc(strtgy_time, 1))
308
309 avg_strtgy = minloc(strtgy_time, 1)
310 call mpi_allreduce(mpi_in_place, avg_strtgy, 1, &
311 mpi_integer, mpi_sum, neko_comm)
312 avg_strtgy = avg_strtgy / pe_size
313
314 write(log_buf, '(A,B0.2,A)') 'Avg. strtgy : [', &
315 strtgy(avg_strtgy),']'
316
317 else
318 read(env_strtgy(1:env_len), *) i
319
320 if (i .lt. 1 .or. i .gt. 4) then
321 call neko_error('Invalid gs sync strtgy')
322 end if
323
324 c%nb_strtgy = strtgy(i)
325 avg_strtgy = i
326
327 write(log_buf, '(A,B0.2,A)') 'Env. strtgy : [', &
328 strtgy(avg_strtgy),']'
329 end if
330
331 call neko_log%message(log_buf)
332
333 end select
334 end if
335 end if
336
337 call neko_log%end_section()
338
339 end subroutine gs_init
340
342 subroutine gs_free(gs)
343 class(gs_t), intent(inout) :: gs
344
345 nullify(gs%dofmap)
346
347 if (allocated(gs%local_gs)) then
348 deallocate(gs%local_gs)
349 end if
350
351 if (allocated(gs%local_dof_gs)) then
352 deallocate(gs%local_dof_gs)
353 end if
354
355 if (allocated(gs%local_gs_dof)) then
356 deallocate(gs%local_gs_dof)
357 end if
358
359 if (allocated(gs%local_blk_len)) then
360 deallocate(gs%local_blk_len)
361 end if
362
363 if (allocated(gs%shared_gs)) then
364 deallocate(gs%shared_gs)
365 end if
366
367 if (allocated(gs%shared_dof_gs)) then
368 deallocate(gs%shared_dof_gs)
369 end if
370
371 if (allocated(gs%shared_gs_dof)) then
372 deallocate(gs%shared_gs_dof)
373 end if
374
375 if (allocated(gs%shared_blk_len)) then
376 deallocate(gs%shared_blk_len)
377 end if
378
379 gs%nlocal = 0
380 gs%nshared = 0
381 gs%nlocal_blks = 0
382 gs%nshared_blks = 0
383
384 call gs%shared_dofs%free()
385
386 if (allocated(gs%bcknd)) then
387 call gs%bcknd%free()
388 deallocate(gs%bcknd)
389 end if
390
391 if (allocated(gs%comm)) then
392 call gs%comm%free()
393 deallocate(gs%comm)
394 end if
395
396 end subroutine gs_free
397
399 subroutine gs_init_mapping(gs)
400 type(gs_t), target, intent(inout) :: gs
401 type(mesh_t), pointer :: msh
402 type(dofmap_t), pointer :: dofmap
403 type(stack_i4_t), target :: local_dof, dof_local, shared_dof, dof_shared
404 type(stack_i4_t), target :: local_face_dof, face_dof_local
405 type(stack_i4_t), target :: shared_face_dof, face_dof_shared
406 integer :: i, j, k, l, lx, ly, lz, max_id, max_sid, id, lid, dm_size
407 type(htable_i8_t) :: dm
408 type(htable_i8_t), pointer :: sdm
409
410 dofmap => gs%dofmap
411 msh => dofmap%msh
412 sdm => gs%shared_dofs
413
414 lx = dofmap%Xh%lx
415 ly = dofmap%Xh%ly
416 lz = dofmap%Xh%lz
417 dm_size = dofmap%size()/lx
418
419 call dm%init(dm_size, i)
423 call sdm%init(dofmap%size(), i)
424
425
426 call local_dof%init()
427 call dof_local%init()
428
429 call local_face_dof%init()
430 call face_dof_local%init()
431
432 call shared_dof%init()
433 call dof_shared%init()
434
435 call shared_face_dof%init()
436 call face_dof_shared%init()
437
438 !
439 ! Setup mapping for dofs points
440 !
441
442 max_id = 0
443 max_sid = 0
444 do i = 1, msh%nelv
445 ! Local id of vertices
446 lid = linear_index(1, 1, 1, i, lx, ly, lz)
447 ! Check if this dof is shared among ranks or not
448 if (dofmap%shared_dof(1, 1, 1, i)) then
449 id = gs_mapping_add_dof(sdm, dofmap%dof(1, 1, 1, i), max_sid)
450 !If add unique gather-scatter id to shared_dof stack
451 call shared_dof%push(id)
452 !If add local id to dof_shared stack
453 call dof_shared%push(lid)
454 !Now we have the mapping of local id <-> gather scatter id!
455 else
456 ! Same here, only here we know the point is local
457 ! It will as such not need to be sent to other ranks later
458 id = gs_mapping_add_dof(dm, dofmap%dof(1, 1, 1, i), max_id)
459 call local_dof%push(id)
460 call dof_local%push(lid)
461 end if
462 ! This procedure is then repeated for all vertices and edges
463 ! Facets can be treated a little bit differently since they only have one
464 ! neighbor
465
466 lid = linear_index(lx, 1, 1, i, lx, ly, lz)
467 if (dofmap%shared_dof(lx, 1, 1, i)) then
468 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, 1, 1, i), max_sid)
469 call shared_dof%push(id)
470 call dof_shared%push(lid)
471 else
472 id = gs_mapping_add_dof(dm, dofmap%dof(lx, 1, 1, i), max_id)
473 call local_dof%push(id)
474 call dof_local%push(lid)
475 end if
476
477 lid = linear_index(1, ly, 1, i, lx, ly, lz)
478 if (dofmap%shared_dof(1, ly, 1, i)) then
479 id = gs_mapping_add_dof(sdm, dofmap%dof(1, ly, 1, i), max_sid)
480 call shared_dof%push(id)
481 call dof_shared%push(lid)
482 else
483 id = gs_mapping_add_dof(dm, dofmap%dof(1, ly, 1, i), max_id)
484 call local_dof%push(id)
485 call dof_local%push(lid)
486 end if
487
488 lid = linear_index(lx, ly, 1, i, lx, ly, lz)
489 if (dofmap%shared_dof(lx, ly, 1, i)) then
490 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, ly, 1, i), max_sid)
491 call shared_dof%push(id)
492 call dof_shared%push(lid)
493 else
494 id = gs_mapping_add_dof(dm, dofmap%dof(lx, ly, 1, i), max_id)
495 call local_dof%push(id)
496 call dof_local%push(lid)
497 end if
498 if (lz .gt. 1) then
499 lid = linear_index(1, 1, lz, i, lx, ly, lz)
500 if (dofmap%shared_dof(1, 1, lz, i)) then
501 id = gs_mapping_add_dof(sdm, dofmap%dof(1, 1, lz, i), max_sid)
502 call shared_dof%push(id)
503 call dof_shared%push(lid)
504 else
505 id = gs_mapping_add_dof(dm, dofmap%dof(1, 1, lz, i), max_id)
506 call local_dof%push(id)
507 call dof_local%push(lid)
508 end if
509
510 lid = linear_index(lx, 1, lz, i, lx, ly, lz)
511 if (dofmap%shared_dof(lx, 1, lz, i)) then
512 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, 1, lz, i), max_sid)
513 call shared_dof%push(id)
514 call dof_shared%push(lid)
515 else
516 id = gs_mapping_add_dof(dm, dofmap%dof(lx, 1, lz, i), max_id)
517 call local_dof%push(id)
518 call dof_local%push(lid)
519 end if
520
521 lid = linear_index(1, ly, lz, i, lx, ly, lz)
522 if (dofmap%shared_dof(1, ly, lz, i)) then
523 id = gs_mapping_add_dof(sdm, dofmap%dof(1, ly, lz, i), max_sid)
524 call shared_dof%push(id)
525 call dof_shared%push(lid)
526 else
527 id = gs_mapping_add_dof(dm, dofmap%dof(1, ly, lz, i), max_id)
528 call local_dof%push(id)
529 call dof_local%push(lid)
530 end if
531
532 lid = linear_index(lx, ly, lz, i, lx, ly, lz)
533 if (dofmap%shared_dof(lx, ly, lz, i)) then
534 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, ly, lz, i), max_sid)
535 call shared_dof%push(id)
536 call dof_shared%push(lid)
537 else
538 id = gs_mapping_add_dof(dm, dofmap%dof(lx, ly, lz, i), max_id)
539 call local_dof%push(id)
540 call dof_local%push(lid)
541 end if
542 end if
543 end do
544
545 ! Clear local dofmap table
546 call dm%clear()
547 ! Get gather scatter ids and local ids of edges
548 if (lz .gt. 1) then
549 !
550 ! Setup mapping for dofs on edges
551 !
552 do i = 1, msh%nelv
553
554 !
555 ! dofs on edges in x-direction
556 !
557 if (dofmap%shared_dof(2, 1, 1, i)) then
558 do j = 2, lx - 1
559 id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, 1, i), max_sid)
560 call shared_dof%push(id)
561 id = linear_index(j, 1, 1, i, lx, ly, lz)
562 call dof_shared%push(id)
563 end do
564 else
565 do j = 2, lx - 1
566 id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, 1, i), max_id)
567 call local_dof%push(id)
568 id = linear_index(j, 1, 1, i, lx, ly, lz)
569 call dof_local%push(id)
570 end do
571 end if
572 if (dofmap%shared_dof(2, 1, lz, i)) then
573 do j = 2, lx - 1
574 id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, lz, i), max_sid)
575 call shared_dof%push(id)
576 id = linear_index(j, 1, lz, i, lx, ly, lz)
577 call dof_shared%push(id)
578 end do
579 else
580 do j = 2, lx - 1
581 id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, lz, i), max_id)
582 call local_dof%push(id)
583 id = linear_index(j, 1, lz, i, lx, ly, lz)
584 call dof_local%push(id)
585 end do
586 end if
587
588 if (dofmap%shared_dof(2, ly, 1, i)) then
589 do j = 2, lx - 1
590 id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, 1, i), max_sid)
591 call shared_dof%push(id)
592 id = linear_index(j, ly, 1, i, lx, ly, lz)
593 call dof_shared%push(id)
594 end do
595
596 else
597 do j = 2, lx - 1
598 id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, 1, i), max_id)
599 call local_dof%push(id)
600 id = linear_index(j, ly, 1, i, lx, ly, lz)
601 call dof_local%push(id)
602 end do
603 end if
604 if (dofmap%shared_dof(2, ly, lz, i)) then
605 do j = 2, lx - 1
606 id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, lz, i), max_sid)
607 call shared_dof%push(id)
608 id = linear_index(j, ly, lz, i, lx, ly, lz)
609 call dof_shared%push(id)
610 end do
611 else
612 do j = 2, lx - 1
613 id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, lz, i), max_id)
614 call local_dof%push(id)
615 id = linear_index(j, ly, lz, i, lx, ly, lz)
616 call dof_local%push(id)
617 end do
618 end if
619
620 !
621 ! dofs on edges in y-direction
622 !
623 if (dofmap%shared_dof(1, 2, 1, i)) then
624 do k = 2, ly - 1
625 id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, 1, i), max_sid)
626 call shared_dof%push(id)
627 id = linear_index(1, k, 1, i, lx, ly, lz)
628 call dof_shared%push(id)
629 end do
630 else
631 do k = 2, ly - 1
632 id = gs_mapping_add_dof(dm, dofmap%dof(1, k, 1, i), max_id)
633 call local_dof%push(id)
634 id = linear_index(1, k, 1, i, lx, ly, lz)
635 call dof_local%push(id)
636 end do
637 end if
638 if (dofmap%shared_dof(1, 2, lz, i)) then
639 do k = 2, ly - 1
640 id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, lz, i), max_sid)
641 call shared_dof%push(id)
642 id = linear_index(1, k, lz, i, lx, ly, lz)
643 call dof_shared%push(id)
644 end do
645 else
646 do k = 2, ly - 1
647 id = gs_mapping_add_dof(dm, dofmap%dof(1, k, lz, i), max_id)
648 call local_dof%push(id)
649 id = linear_index(1, k, lz, i, lx, ly, lz)
650 call dof_local%push(id)
651 end do
652 end if
653
654 if (dofmap%shared_dof(lx, 2, 1, i)) then
655 do k = 2, ly - 1
656 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, 1, i), max_sid)
657 call shared_dof%push(id)
658 id = linear_index(lx, k, 1, i, lx, ly, lz)
659 call dof_shared%push(id)
660 end do
661 else
662 do k = 2, ly - 1
663 id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, 1, i), max_id)
664 call local_dof%push(id)
665 id = linear_index(lx, k, 1, i, lx, ly, lz)
666 call dof_local%push(id)
667 end do
668 end if
669 if (dofmap%shared_dof(lx, 2, lz, i)) then
670 do k = 2, ly - 1
671 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, lz, i), max_sid)
672 call shared_dof%push(id)
673 id = linear_index(lx, k, lz, i, lx, ly, lz)
674 call dof_shared%push(id)
675 end do
676 else
677 do k = 2, ly - 1
678 id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, lz, i), max_id)
679 call local_dof%push(id)
680 id = linear_index(lx, k, lz, i, lx, ly, lz)
681 call dof_local%push(id)
682 end do
683 end if
684 !
685 ! dofs on edges in z-direction
686 !
687 if (dofmap%shared_dof(1, 1, 2, i)) then
688 do l = 2, lz - 1
689 id = gs_mapping_add_dof(sdm, dofmap%dof(1, 1, l, i), max_sid)
690 call shared_dof%push(id)
691 id = linear_index(1, 1, l, i, lx, ly, lz)
692 call dof_shared%push(id)
693 end do
694 else
695 do l = 2, lz - 1
696 id = gs_mapping_add_dof(dm, dofmap%dof(1, 1, l, i), max_id)
697 call local_dof%push(id)
698 id = linear_index(1, 1, l, i, lx, ly, lz)
699 call dof_local%push(id)
700 end do
701 end if
702
703 if (dofmap%shared_dof(lx, 1, 2, i)) then
704 do l = 2, lz - 1
705 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, 1, l, i), max_sid)
706 call shared_dof%push(id)
707 id = linear_index(lx, 1, l, i, lx, ly, lz)
708 call dof_shared%push(id)
709 end do
710 else
711 do l = 2, lz - 1
712 id = gs_mapping_add_dof(dm, dofmap%dof(lx, 1, l, i), max_id)
713 call local_dof%push(id)
714 id = linear_index(lx, 1, l, i, lx, ly, lz)
715 call dof_local%push(id)
716 end do
717 end if
718
719 if (dofmap%shared_dof(1, ly, 2, i)) then
720 do l = 2, lz - 1
721 id = gs_mapping_add_dof(sdm, dofmap%dof(1, ly, l, i), max_sid)
722 call shared_dof%push(id)
723 id = linear_index(1, ly, l, i, lx, ly, lz)
724 call dof_shared%push(id)
725 end do
726 else
727 do l = 2, lz - 1
728 id = gs_mapping_add_dof(dm, dofmap%dof(1, ly, l, i), max_id)
729 call local_dof%push(id)
730 id = linear_index(1, ly, l, i, lx, ly, lz)
731 call dof_local%push(id)
732 end do
733 end if
734
735 if (dofmap%shared_dof(lx, ly, 2, i)) then
736 do l = 2, lz - 1
737 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, ly, l, i), max_sid)
738 call shared_dof%push(id)
739 id = linear_index(lx, ly, l, i, lx, ly, lz)
740 call dof_shared%push(id)
741 end do
742 else
743 do l = 2, lz - 1
744 id = gs_mapping_add_dof(dm, dofmap%dof(lx, ly, l, i), max_id)
745 call local_dof%push(id)
746 id = linear_index(lx, ly, l, i, lx, ly, lz)
747 call dof_local%push(id)
748 end do
749 end if
750 end do
751 end if
752
753 ! Clear local dofmap table
754 call dm%clear()
755
756 !
757 ! Setup mapping for dofs on facets
758 !
759 ! This is for 2d
760 if (lz .eq. 1) then
761 do i = 1, msh%nelv
762
763 !
764 ! dofs on edges in x-direction
765 !
766 if (msh%facet_neigh(3, i) .ne. 0) then
767 if (dofmap%shared_dof(2, 1, 1, i)) then
768 do j = 2, lx - 1
769 id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, 1, i), max_sid)
770 call shared_face_dof%push(id)
771 id = linear_index(j, 1, 1, i, lx, ly, lz)
772 call face_dof_shared%push(id)
773 end do
774 else
775 do j = 2, lx - 1
776 id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, 1, i), max_id)
777 call local_face_dof%push(id)
778 id = linear_index(j, 1, 1, i, lx, ly, lz)
779 call face_dof_local%push(id)
780 end do
781 end if
782 end if
783
784 if (msh%facet_neigh(4, i) .ne. 0) then
785 if (dofmap%shared_dof(2, ly, 1, i)) then
786 do j = 2, lx - 1
787 id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, 1, i), max_sid)
788 call shared_face_dof%push(id)
789 id = linear_index(j, ly, 1, i, lx, ly, lz)
790 call face_dof_shared%push(id)
791 end do
792
793 else
794 do j = 2, lx - 1
795 id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, 1, i), max_id)
796 call local_face_dof%push(id)
797 id = linear_index(j, ly, 1, i, lx, ly, lz)
798 call face_dof_local%push(id)
799 end do
800 end if
801 end if
802
803 !
804 ! dofs on edges in y-direction
805 !
806 if (msh%facet_neigh(1, i) .ne. 0) then
807 if (dofmap%shared_dof(1, 2, 1, i)) then
808 do k = 2, ly - 1
809 id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, 1, i), max_sid)
810 call shared_face_dof%push(id)
811 id = linear_index(1, k, 1, i, lx, ly, lz)
812 call face_dof_shared%push(id)
813 end do
814 else
815 do k = 2, ly - 1
816 id = gs_mapping_add_dof(dm, dofmap%dof(1, k, 1, i), max_id)
817 call local_face_dof%push(id)
818 id = linear_index(1, k, 1, i, lx, ly, lz)
819 call face_dof_local%push(id)
820 end do
821 end if
822 end if
823
824 if (msh%facet_neigh(2, i) .ne. 0) then
825 if (dofmap%shared_dof(lx, 2, 1, i)) then
826 do k = 2, ly - 1
827 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, 1, i), max_sid)
828 call shared_face_dof%push(id)
829 id = linear_index(lx, k, 1, i, lx, ly, lz)
830 call face_dof_shared%push(id)
831 end do
832 else
833 do k = 2, ly - 1
834 id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, 1, i), max_id)
835 call local_face_dof%push(id)
836 id = linear_index(lx, k, 1, i, lx, ly, lz)
837 call face_dof_local%push(id)
838 end do
839 end if
840 end if
841 end do
842 else
843 do i = 1, msh%nelv
844
845 ! Facets in x-direction (s, t)-plane
846 if (msh%facet_neigh(1, i) .ne. 0) then
847 if (dofmap%shared_dof(1, 2, 2, i)) then
848 do l = 2, lz - 1
849 do k = 2, ly - 1
850 id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, l, i), max_sid)
851 call shared_face_dof%push(id)
852 id = linear_index(1, k, l, i, lx, ly, lz)
853 call face_dof_shared%push(id)
854 end do
855 end do
856 else
857 do l = 2, lz - 1
858 do k = 2, ly - 1
859 id = gs_mapping_add_dof(dm, dofmap%dof(1, k, l, i), max_id)
860 call local_face_dof%push(id)
861 id = linear_index(1, k, l, i, lx, ly, lz)
862 call face_dof_local%push(id)
863 end do
864 end do
865 end if
866 end if
867
868 if (msh%facet_neigh(2, i) .ne. 0) then
869 if (dofmap%shared_dof(lx, 2, 2, i)) then
870 do l = 2, lz - 1
871 do k = 2, ly - 1
872 id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, l, i), max_sid)
873 call shared_face_dof%push(id)
874 id = linear_index(lx, k, l, i, lx, ly, lz)
875 call face_dof_shared%push(id)
876 end do
877 end do
878 else
879 do l = 2, lz - 1
880 do k = 2, ly - 1
881 id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, l, i), max_id)
882 call local_face_dof%push(id)
883 id = linear_index(lx, k, l, i, lx, ly, lz)
884 call face_dof_local%push(id)
885 end do
886 end do
887 end if
888 end if
889
890 ! Facets in y-direction (r, t)-plane
891 if (msh%facet_neigh(3, i) .ne. 0) then
892 if (dofmap%shared_dof(2, 1, 2, i)) then
893 do l = 2, lz - 1
894 do j = 2, lx - 1
895 id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, l, i), max_sid)
896 call shared_face_dof%push(id)
897 id = linear_index(j, 1, l, i, lx, ly, lz)
898 call face_dof_shared%push(id)
899 end do
900 end do
901 else
902 do l = 2, lz - 1
903 do j = 2, lx - 1
904 id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, l, i), max_id)
905 call local_face_dof%push(id)
906 id = linear_index(j, 1, l, i, lx, ly, lz)
907 call face_dof_local%push(id)
908 end do
909 end do
910 end if
911 end if
912
913 if (msh%facet_neigh(4, i) .ne. 0) then
914 if (dofmap%shared_dof(2, ly, 2, i)) then
915 do l = 2, lz - 1
916 do j = 2, lx - 1
917 id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, l, i), max_sid)
918 call shared_face_dof%push(id)
919 id = linear_index(j, ly, l, i, lx, ly, lz)
920 call face_dof_shared%push(id)
921 end do
922 end do
923 else
924 do l = 2, lz - 1
925 do j = 2, lx - 1
926 id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, l, i), max_id)
927 call local_face_dof%push(id)
928 id = linear_index(j, ly, l, i, lx, ly, lz)
929 call face_dof_local%push(id)
930 end do
931 end do
932 end if
933 end if
934
935 ! Facets in z-direction (r, s)-plane
936 if (msh%facet_neigh(5, i) .ne. 0) then
937 if (dofmap%shared_dof(2, 2, 1, i)) then
938 do k = 2, ly - 1
939 do j = 2, lx - 1
940 id = gs_mapping_add_dof(sdm, dofmap%dof(j, k, 1, i), max_sid)
941 call shared_face_dof%push(id)
942 id = linear_index(j, k, 1, i, lx, ly, lz)
943 call face_dof_shared%push(id)
944 end do
945 end do
946 else
947 do k = 2, ly - 1
948 do j = 2, lx - 1
949 id = gs_mapping_add_dof(dm, dofmap%dof(j, k, 1, i), max_id)
950 call local_face_dof%push(id)
951 id = linear_index(j, k, 1, i, lx, ly, lz)
952 call face_dof_local%push(id)
953 end do
954 end do
955 end if
956 end if
957
958 if (msh%facet_neigh(6, i) .ne. 0) then
959 if (dofmap%shared_dof(2, 2, lz, i)) then
960 do k = 2, ly - 1
961 do j = 2, lx - 1
962 id = gs_mapping_add_dof(sdm, dofmap%dof(j, k, lz, i), max_sid)
963 call shared_face_dof%push(id)
964 id = linear_index(j, k, lz, i, lx, ly, lz)
965 call face_dof_shared%push(id)
966 end do
967 end do
968 else
969 do k = 2, ly - 1
970 do j = 2, lx - 1
971 id = gs_mapping_add_dof(dm, dofmap%dof(j, k, lz, i), max_id)
972 call local_face_dof%push(id)
973 id = linear_index(j, k, lz, i, lx, ly, lz)
974 call face_dof_local%push(id)
975 end do
976 end do
977 end if
978 end if
979 end do
980 end if
981
982
983 call dm%free()
984
985 gs%nlocal = local_dof%size() + local_face_dof%size()
986 gs%local_facet_offset = local_dof%size() + 1
987
988 ! Finalize local dof to gather-scatter index
989 allocate(gs%local_dof_gs(gs%nlocal))
990
991 ! Add dofs on points and edges
992
993 ! We should use the %array() procedure, which works great for
994 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
995 ! certain data types
996 select type(dof_array => local_dof%data)
997 type is (integer)
998 j = local_dof%size()
999 do i = 1, j
1000 gs%local_dof_gs(i) = dof_array(i)
1001 end do
1002 end select
1003 call local_dof%free()
1004
1005 ! Add dofs on faces
1006
1007 ! We should use the %array() procedure, which works great for
1008 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1009 ! certain data types
1010 select type(dof_array => local_face_dof%data)
1011 type is (integer)
1012 do i = 1, local_face_dof%size()
1013 gs%local_dof_gs(i + j) = dof_array(i)
1014 end do
1015 end select
1016 call local_face_dof%free()
1017
1018 ! Finalize local gather-scatter index to dof
1019 allocate(gs%local_gs_dof(gs%nlocal))
1020
1021 ! Add gather-scatter index on points and edges
1022
1023 ! We should use the %array() procedure, which works great for
1024 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1025 ! certain data types
1026 select type(dof_array => dof_local%data)
1027 type is (integer)
1028 j = dof_local%size()
1029 do i = 1, j
1030 gs%local_gs_dof(i) = dof_array(i)
1031 end do
1032 end select
1033 call dof_local%free()
1034
1035 ! We should use the %array() procedure, which works great for
1036 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1037 ! certain data types
1038 select type(dof_array => face_dof_local%data)
1039 type is (integer)
1040 do i = 1, face_dof_local%size()
1041 gs%local_gs_dof(i+j) = dof_array(i)
1042 end do
1043 end select
1044 call face_dof_local%free()
1045
1046 call gs_qsort_dofmap(gs%local_dof_gs, gs%local_gs_dof, &
1047 gs%nlocal, 1, gs%nlocal)
1048
1049 call gs_find_blks(gs%local_dof_gs, gs%local_blk_len, &
1050 gs%nlocal_blks, gs%nlocal, gs%local_facet_offset)
1051
1052 ! Allocate buffer for local gs-ops
1053 allocate(gs%local_gs(gs%nlocal))
1054
1055 gs%nshared = shared_dof%size() + shared_face_dof%size()
1056 gs%shared_facet_offset = shared_dof%size() + 1
1057
1058 ! Finalize shared dof to gather-scatter index
1059 allocate(gs%shared_dof_gs(gs%nshared))
1060
1061 ! Add shared dofs on points and edges
1062
1063 ! We should use the %array() procedure, which works great for
1064 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1065 ! certain data types
1066 select type(dof_array => shared_dof%data)
1067 type is (integer)
1068 j = shared_dof%size()
1069 do i = 1, j
1070 gs%shared_dof_gs(i) = dof_array(i)
1071 end do
1072 end select
1073 call shared_dof%free()
1074
1075 ! Add shared dofs on faces
1076
1077 ! We should use the %array() procedure, which works great for
1078 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1079 ! certain data types
1080 select type(dof_array => shared_face_dof%data)
1081 type is (integer)
1082 do i = 1, shared_face_dof%size()
1083 gs%shared_dof_gs(i + j) = dof_array(i)
1084 end do
1085 end select
1086 call shared_face_dof%free()
1087
1088 ! Finalize shared gather-scatter index to dof
1089 allocate(gs%shared_gs_dof(gs%nshared))
1090
1091 ! Add dofs on points and edges
1092
1093 ! We should use the %array() procedure, which works great for
1094 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1095 ! certain data types
1096 select type(dof_array => dof_shared%data)
1097 type is(integer)
1098 j = dof_shared%size()
1099 do i = 1, j
1100 gs%shared_gs_dof(i) = dof_array(i)
1101 end do
1102 end select
1103 call dof_shared%free()
1104
1105 ! We should use the %array() procedure, which works great for
1106 ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1107 ! certain data types
1108 select type(dof_array => face_dof_shared%data)
1109 type is (integer)
1110 do i = 1, face_dof_shared%size()
1111 gs%shared_gs_dof(i + j) = dof_array(i)
1112 end do
1113 end select
1114 call face_dof_shared%free()
1115
1116 ! Allocate buffer for shared gs-ops
1117 allocate(gs%shared_gs(gs%nshared))
1118
1119 if (gs%nshared .gt. 0) then
1120 call gs_qsort_dofmap(gs%shared_dof_gs, gs%shared_gs_dof, &
1121 gs%nshared, 1, gs%nshared)
1122
1123 call gs_find_blks(gs%shared_dof_gs, gs%shared_blk_len, &
1124 gs%nshared_blks, gs%nshared, gs%shared_facet_offset)
1125 end if
1126
1127 contains
1128
1137 function gs_mapping_add_dof(map_, dof, max_id) result(id)
1138 type(htable_i8_t), intent(inout) :: map_
1139 integer(kind=i8), intent(inout) :: dof
1140 integer, intent(inout) :: max_id
1141 integer :: id
1142
1143 if (map_%get(dof, id) .gt. 0) then
1144 max_id = max_id + 1
1145 call map_%set(dof, max_id)
1146 id = max_id
1147 end if
1148
1149 end function gs_mapping_add_dof
1150
1152 recursive subroutine gs_qsort_dofmap(dg, gd, n, lo, hi)
1153 integer, intent(inout) :: n
1154 integer, dimension(n), intent(inout) :: dg
1155 integer, dimension(n), intent(inout) :: gd
1156 integer :: lo, hi
1157 integer :: tmp, i, j, pivot
1158
1159 i = lo - 1
1160 j = hi + 1
1161 pivot = dg((lo + hi) / 2)
1162 do
1163 do
1164 i = i + 1
1165 if (dg(i) .ge. pivot) exit
1166 end do
1167
1168 do
1169 j = j - 1
1170 if (dg(j) .le. pivot) exit
1171 end do
1172
1173 if (i .lt. j) then
1174 tmp = dg(i)
1175 dg(i) = dg(j)
1176 dg(j) = tmp
1177
1178 tmp = gd(i)
1179 gd(i) = gd(j)
1180 gd(j) = tmp
1181 else if (i .eq. j) then
1182 i = i + 1
1183 exit
1184 else
1185 exit
1186 end if
1187 end do
1188 if (lo .lt. j) call gs_qsort_dofmap(dg, gd, n, lo, j)
1189 if (i .lt. hi) call gs_qsort_dofmap(dg, gd, n, i, hi)
1190
1191 end subroutine gs_qsort_dofmap
1192
1194 subroutine gs_find_blks(dg, blk_len, nblks, n, m)
1195 integer, intent(in) :: n
1196 integer, intent(in) :: m
1197 integer, dimension(n), intent(inout) :: dg
1198 integer, allocatable, intent(inout) :: blk_len(:)
1199 integer, intent(inout) :: nblks
1200 integer :: i, j
1201 integer :: id, count
1202 type(stack_i4_t), target :: blks
1203
1204 call blks%init()
1205 i = 1
1206 do while( i .lt. m)
1207 id = dg(i)
1208 count = 1
1209 j = i
1210 do while ( j+1 .le. n .and. dg(j+1) .eq. id)
1211 j = j + 1
1212 count = count + 1
1213 end do
1214 call blks%push(count)
1215 i = j + 1
1216 end do
1217
1218 select type(blk_array => blks%data)
1219 type is(integer)
1220 nblks = blks%size()
1221 allocate(blk_len(nblks))
1222 do i = 1, nblks
1223 blk_len(i) = blk_array(i)
1224 end do
1225 end select
1226 call blks%free()
1227
1228 end subroutine gs_find_blks
1229
1230 end subroutine gs_init_mapping
1231
1233 subroutine gs_schedule(gs)
1234 type(gs_t), target, intent(inout) :: gs
1235 integer(kind=i8), allocatable :: send_buf(:), recv_buf(:)
1236 integer(kind=i2), allocatable :: shared_flg(:), recv_flg(:)
1237 type(htable_iter_i8_t) :: it
1238 type(stack_i4_t) :: send_pe, recv_pe
1239 type(mpi_status) :: status
1240 type(mpi_request) :: send_req, recv_req
1241 integer :: i, j, max_recv, src, dst, ierr, n_recv
1242 integer :: tmp, shared_gs_id
1243 integer :: nshared_unique
1244
1245 nshared_unique = gs%shared_dofs%num_entries()
1246
1247 call it%init(gs%shared_dofs)
1248 allocate(send_buf(nshared_unique))
1249 i = 1
1250 do while(it%next())
1251 send_buf(i) = it%key()
1252 i = i + 1
1253 end do
1254
1255 call send_pe%init()
1256 call recv_pe%init()
1257
1258
1259 !
1260 ! Schedule exchange of shared dofs
1261 !
1262
1263 call mpi_allreduce(nshared_unique, max_recv, 1, &
1264 mpi_integer, mpi_max, neko_comm, ierr)
1265
1266 allocate(recv_buf(max_recv))
1267 allocate(shared_flg(max_recv))
1268 allocate(recv_flg(max_recv))
1269
1271 do i = 1, size(gs%dofmap%msh%neigh_order)
1272 src = modulo(pe_rank - gs%dofmap%msh%neigh_order(i) + pe_size, pe_size)
1273 dst = modulo(pe_rank + gs%dofmap%msh%neigh_order(i), pe_size)
1274
1275 if (gs%dofmap%msh%neigh(src)) then
1276 call mpi_irecv(recv_buf, max_recv, mpi_integer8, &
1277 src, 0, neko_comm, recv_req, ierr)
1278 end if
1279
1280 if (gs%dofmap%msh%neigh(dst)) then
1281 call mpi_isend(send_buf, nshared_unique, mpi_integer8, &
1282 dst, 0, neko_comm, send_req, ierr)
1283 end if
1284
1285 if (gs%dofmap%msh%neigh(src)) then
1286 call mpi_wait(recv_req, status, ierr)
1287 call mpi_get_count(status, mpi_integer8, n_recv, ierr)
1288
1289 do j = 1, n_recv
1290 shared_flg(j) = gs%shared_dofs%get(recv_buf(j), shared_gs_id)
1291 if (shared_flg(j) .eq. 0) then
1293 call gs%comm%recv_dof(src)%push(shared_gs_id)
1294 end if
1295 end do
1296
1297 if (gs%comm%recv_dof(src)%size() .gt. 0) then
1298 call recv_pe%push(src)
1299 end if
1300 end if
1301
1302 if (gs%dofmap%msh%neigh(dst)) then
1303 call mpi_wait(send_req, mpi_status_ignore, ierr)
1304 call mpi_irecv(recv_flg, max_recv, mpi_integer2, &
1305 dst, 0, neko_comm, recv_req, ierr)
1306 end if
1307
1308 if (gs%dofmap%msh%neigh(src)) then
1309 call mpi_isend(shared_flg, n_recv, mpi_integer2, &
1310 src, 0, neko_comm, send_req, ierr)
1311 end if
1312
1313 if (gs%dofmap%msh%neigh(dst)) then
1314 call mpi_wait(recv_req, status, ierr)
1315 call mpi_get_count(status, mpi_integer2, n_recv, ierr)
1316
1317 do j = 1, n_recv
1318 if (recv_flg(j) .eq. 0) then
1319 tmp = gs%shared_dofs%get(send_buf(j), shared_gs_id)
1321 call gs%comm%send_dof(dst)%push(shared_gs_id)
1322 end if
1323 end do
1324
1325 if (gs%comm%send_dof(dst)%size() .gt. 0) then
1326 call send_pe%push(dst)
1327 end if
1328 end if
1329
1330 if (gs%dofmap%msh%neigh(src)) then
1331 call mpi_wait(send_req, mpi_status_ignore, ierr)
1332 end if
1333
1334 end do
1335
1336 call gs%comm%init(send_pe, recv_pe)
1337
1338 call send_pe%free()
1339 call recv_pe%free()
1340
1341 deallocate(send_buf)
1342 deallocate(recv_flg)
1343 deallocate(shared_flg)
1344 !This arrays seems to take massive amounts of memory...
1345 call gs%shared_dofs%free()
1346
1347 end subroutine gs_schedule
1348
1350 subroutine gs_op_fld(gs, u, op, event)
1351 class(gs_t), intent(inout) :: gs
1352 type(field_t), intent(inout) :: u
1353 type(c_ptr), optional, intent(inout) :: event
1354 integer :: n, op
1355
1356 n = u%msh%nelv * u%Xh%lx * u%Xh%ly * u%Xh%lz
1357 if (present(event)) then
1358 call gs_op_vector(gs, u%x, n, op, event)
1359 else
1360 call gs_op_vector(gs, u%x, n, op)
1361 end if
1362
1363 end subroutine gs_op_fld
1364
1366 subroutine gs_op_r4(gs, u, n, op, event)
1367 class(gs_t), intent(inout) :: gs
1368 integer, intent(in) :: n
1369 real(kind=rp), contiguous, dimension(:,:,:,:), intent(inout) :: u
1370 type(c_ptr), optional, intent(inout) :: event
1371 integer :: op
1372
1373 if (present(event)) then
1374 call gs_op_vector(gs, u, n, op, event)
1375 else
1376 call gs_op_vector(gs, u, n, op)
1377 end if
1378
1379 end subroutine gs_op_r4
1380
1382 subroutine gs_op_vector(gs, u, n, op, event)
1383 class(gs_t), intent(inout) :: gs
1384 integer, intent(in) :: n
1385 real(kind=rp), dimension(n), intent(inout) :: u
1386 type(c_ptr), optional, intent(inout) :: event
1387 integer :: m, l, op, lo, so
1388
1389 lo = gs%local_facet_offset
1390 so = -gs%shared_facet_offset
1391 m = gs%nlocal
1392 l = gs%nshared
1393
1394 call profiler_start_region("gather_scatter", 5)
1395 ! Gather shared dofs
1396 if (pe_size .gt. 1) then
1397 call profiler_start_region("gs_nbrecv", 13)
1398 call gs%comm%nbrecv()
1399 call profiler_end_region("gs_nbrecv", 13)
1400 call profiler_start_region("gs_gather_shared", 14)
1401 call gs%bcknd%gather(gs%shared_gs, l, so, gs%shared_dof_gs, u, n, &
1402 gs%shared_gs_dof, gs%nshared_blks, gs%shared_blk_len, op, .true.)
1403 call profiler_end_region("gs_gather_shared", 14)
1404 call profiler_start_region("gs_nbsend", 6)
1405 call gs%comm%nbsend(gs%shared_gs, l, &
1406 gs%bcknd%gather_event, gs%bcknd%gs_stream)
1407 call profiler_end_region("gs_nbsend", 6)
1408
1409 end if
1410
1411 ! Gather-scatter local dofs
1412 call profiler_start_region("gs_local", 12)
1413 call gs%bcknd%gather(gs%local_gs, m, lo, gs%local_dof_gs, u, n, &
1414 gs%local_gs_dof, gs%nlocal_blks, gs%local_blk_len, op, .false.)
1415 call gs%bcknd%scatter(gs%local_gs, m, gs%local_dof_gs, u, n, &
1416 gs%local_gs_dof, gs%nlocal_blks, gs%local_blk_len, .false., c_null_ptr)
1417 call profiler_end_region("gs_local", 12)
1418 ! Scatter shared dofs
1419 if (pe_size .gt. 1) then
1420 call profiler_start_region("gs_nbwait", 7)
1421 call gs%comm%nbwait(gs%shared_gs, l, op, gs%bcknd%gs_stream)
1422 call profiler_end_region("gs_nbwait", 7)
1423 call profiler_start_region("gs_scatter_shared", 15)
1424 if (present(event)) then
1425 call gs%bcknd%scatter(gs%shared_gs, l,&
1426 gs%shared_dof_gs, u, n, &
1427 gs%shared_gs_dof, gs%nshared_blks, &
1428 gs%shared_blk_len, .true., event)
1429 else
1430 call gs%bcknd%scatter(gs%shared_gs, l,&
1431 gs%shared_dof_gs, u, n, &
1432 gs%shared_gs_dof, gs%nshared_blks, &
1433 gs%shared_blk_len, .true., c_null_ptr)
1434 end if
1435 call profiler_end_region("gs_scatter_shared", 15)
1436 end if
1437
1438 call profiler_end_region("gather_scatter", 5)
1439
1440 end subroutine gs_op_vector
1441
1442end module gather_scatter
subroutine gs_find_blks(dg, blk_len, nblks, n, m)
Find blocks sharing dofs in non-facet data.
recursive subroutine gs_qsort_dofmap(dg, gd, n, lo, hi)
Sort the dof lists based on the dof to gather-scatter list.
integer function gs_mapping_add_dof(map_, dof, max_id)
Register a unique dof Takes the unique id dof and checks if it is in the htable map_ If it is we retu...
Deassociate a Fortran array from a device pointer.
Definition device.F90:90
Map a Fortran array to a device (allocate and associate)
Definition device.F90:72
Copy data between host and device (or device and device)
Definition device.F90:66
Synchronize a device or stream.
Definition device.F90:102
Definition comm.F90:1
integer, public pe_size
MPI size of communicator.
Definition comm.F90:58
integer, public pe_rank
MPI rank.
Definition comm.F90:55
type(mpi_comm), public neko_comm
MPI communicator.
Definition comm.F90:42
Device abstraction, common interface for various accelerators.
Definition device.F90:34
integer, parameter, public host_to_device
Definition device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:214
Defines a mapping of the degrees of freedom.
Definition dofmap.f90:35
Defines a field.
Definition field.f90:34
Gather-scatter.
subroutine gs_init_mapping(gs)
Setup mapping of dofs to gather-scatter operations.
subroutine gs_schedule(gs)
Schedule shared gather-scatter operations.
subroutine gs_free(gs)
Deallocate a gather-scatter kernel.
subroutine gs_op_r4(gs, u, n, op, event)
Gather-scatter operation on a rank 4 array.
subroutine gs_init(gs, dofmap, bcknd, comm_bcknd)
Initialize a gather-scatter kernel.
subroutine gs_op_vector(gs, u, n, op, event)
Gather-scatter operation on a vector u with op op.
subroutine gs_op_fld(gs, u, op, event)
Gather-scatter operation on a field u with op op.
Defines a gather-scatter backend.
Definition gs_bcknd.f90:34
integer, parameter, public gs_bcknd_cpu
Definition gs_bcknd.f90:40
integer, parameter, public gs_bcknd_sx
Definition gs_bcknd.f90:40
integer, parameter, public gs_bcknd_dev
Definition gs_bcknd.f90:40
Defines a gather-scatter communication method.
Definition gs_comm.f90:34
integer, parameter, public gs_comm_mpigpu
Definition gs_comm.f90:42
integer, parameter, public gs_comm_mpi
Definition gs_comm.f90:42
integer, parameter, public gs_comm_nvshmem
Definition gs_comm.f90:42
integer, parameter, public gs_comm_nccl
Definition gs_comm.f90:42
Generic Gather-scatter backend for CPUs.
Definition gs_cpu.f90:34
Defines GPU aware MPI gather-scatter communication.
Defines NCCL based gather-scatter communication.
Defines GPU aware MPI gather-scatter communication.
Generic Gather-scatter backend for accelerators.
Definition gs_device.F90:34
Defines MPI gather-scatter communication.
Definition gs_mpi.f90:34
Defines Gather-scatter operations.
Definition gs_ops.f90:34
integer, parameter, public gs_op_add
Definition gs_ops.f90:36
integer, parameter, public gs_op_max
Definition gs_ops.f90:36
integer, parameter, public gs_op_min
Definition gs_ops.f90:36
integer, parameter, public gs_op_mul
Definition gs_ops.f90:36
Generic Gather-scatter backend for NEC Vector Engines.
Definition gs_sx.f90:34
Implements a hash table ADT.
Definition htable.f90:36
Logging routines.
Definition log.f90:34
type(log_t), public neko_log
Global log stream.
Definition log.f90:70
integer, parameter, public log_size
Definition log.f90:46
Defines a mesh.
Definition mesh.f90:34
Build configurations.
integer, parameter neko_bcknd_sx
integer, parameter neko_bcknd_hip
integer, parameter neko_bcknd_device
integer, parameter neko_bcknd_opencl
logical, parameter neko_device_mpi
integer, parameter neko_bcknd_cuda
integer, parameter, public i2
Definition num_types.f90:5
integer, parameter, public i8
Definition num_types.f90:7
integer, parameter, public dp
Definition num_types.f90:9
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Profiling interface.
Definition profiler.F90:34
subroutine, public profiler_start_region(name, region_id)
Started a named (name) profiler region.
Definition profiler.F90:78
subroutine, public profiler_end_region(name, region_id)
End the most recently started profiler region.
Definition profiler.F90:109
Implements a dynamic stack ADT.
Definition stack.f90:35
Utilities.
Definition utils.f90:35
pure integer function, public linear_index(i, j, k, l, lx, ly, lz)
Compute the address of a (i,j,k,l) array with sizes (1:lx, 1:ly, 1:lz, :)
Definition utils.f90:175
Gather-scatter backend.
Definition gs_bcknd.f90:44
Gather-scatter communication method.
Definition gs_comm.f90:46
Gather-scatter backend for CPUs.
Definition gs_cpu.f90:43
Gather-scatter backend for offloading devices.
Definition gs_device.F90:48
Gather-scatter communication using device MPI. The arrays are indexed per PE like send_pe and @ recv_...
Gather-scatter communication using NCCL The arrays are indexed per PE like send_pe and @ recv_pe.
Gather-scatter communication using device SHMEM. The arrays are indexed per PE like send_pe and @ rec...
Gather-scatter communication using MPI.
Definition gs_mpi.f90:61
Gather-scatter backend for NEC SX-Aurora.
Definition gs_sx.f90:43
Integer*8 based hash table.
Definition htable.f90:92
Iterator for an integer*8 based hash table.
Definition htable.f90:175
Integer based stack.
Definition stack.f90:63