Neko  0.8.99
A portable framework for high-order spectral element flow simulations
gather_scatter.f90
Go to the documentation of this file.
1 ! Copyright (c) 2020-2023, The Neko Authors
2 ! All rights reserved.
3 !
4 ! Redistribution and use in source and binary forms, with or without
5 ! modification, are permitted provided that the following conditions
6 ! are met:
7 !
8 ! * Redistributions of source code must retain the above copyright
9 ! notice, this list of conditions and the following disclaimer.
10 !
11 ! * Redistributions in binary form must reproduce the above
12 ! copyright notice, this list of conditions and the following
13 ! disclaimer in the documentation and/or other materials provided
14 ! with the distribution.
15 !
16 ! * Neither the name of the authors nor the names of its
17 ! contributors may be used to endorse or promote products derived
18 ! from this software without specific prior written permission.
19 !
20 ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 ! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 ! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 ! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 ! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 ! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 ! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 ! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 ! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 ! POSSIBILITY OF SUCH DAMAGE.
32 !
35  use neko_config
37  use gs_device, only : gs_device_t
38  use gs_sx, only : gs_sx_t
39  use gs_cpu, only : gs_cpu_t
41  use gs_comm, only : gs_comm_t
42  use gs_mpi, only : gs_mpi_t
43  use gs_device_mpi, only : gs_device_mpi_t
44  use mesh, only : mesh_t
45  use comm
46  use dofmap, only : dofmap_t
47  use field, only : field_t
48  use num_types, only : rp, dp, i2
50  use stack, only : stack_i4_t
51  use utils, only : neko_error, linear_index
52  use logger, only : neko_log, log_size
54  use device
55  implicit none
56  private
57 
58  type, public :: gs_t
59  real(kind=rp), allocatable :: local_gs(:)
60  integer, allocatable :: local_dof_gs(:)
61  integer, allocatable :: local_gs_dof(:)
62  integer, allocatable :: local_blk_len(:)
63  real(kind=rp), allocatable :: shared_gs(:)
64  integer, allocatable :: shared_dof_gs(:)
65  integer, allocatable :: shared_gs_dof(:)
66  integer, allocatable :: shared_blk_len(:)
67  type(dofmap_t), pointer ::dofmap
68  type(htable_i8_t) :: shared_dofs
69  integer :: nlocal
70  integer :: nshared
71  integer :: nlocal_blks
72  integer :: nshared_blks
73  integer :: local_facet_offset
74  integer :: shared_facet_offset
75  class(gs_bcknd_t), allocatable :: bcknd
76  class(gs_comm_t), allocatable :: comm
77  contains
78  procedure, private, pass(gs) :: gs_op_fld
79  procedure, private, pass(gs) :: gs_op_r4
80  procedure, pass(gs) :: gs_op_vector
81  procedure, pass(gs) :: init => gs_init
82  procedure, pass(gs) :: free => gs_free
83  generic :: op => gs_op_fld, gs_op_r4, gs_op_vector
84  end type gs_t
85 
87 
88 contains
89 
91  subroutine gs_init(gs, dofmap, bcknd)
92  class(gs_t), intent(inout) :: gs
93  type(dofmap_t), target, intent(inout) :: dofmap
94  character(len=LOG_SIZE) :: log_buf
95  character(len=20) :: bcknd_str
96  integer, optional :: bcknd
97  integer :: i, j, ierr, bcknd_
98  integer(i8) :: glb_nshared, glb_nlocal
99  logical :: use_device_mpi
100  real(kind=rp), allocatable :: tmp(:)
101  type(c_ptr) :: tmp_d = c_null_ptr
102  integer :: strtgy(4) = (/ int(b'00'), int(b'01'), int(b'10'), int(b'11') /)
103  integer :: avg_strtgy, env_len
104  character(len=255) :: env_strtgy
105  real(kind=dp) :: strtgy_time(4)
106 
107  call gs%free()
108 
109  call neko_log%section('Gather-Scatter')
110 
111  gs%dofmap => dofmap
112 
113  ! Here one could use some heuristic or autotuning to select comm method,
114  ! such as only using device MPI when there is enough data.
115  !use_device_mpi = NEKO_DEVICE_MPI .and. gs%nshared .gt. 20000
116  use_device_mpi = neko_device_mpi
117 
118  if (use_device_mpi) then
119  call neko_log%message('Comm : Device MPI')
120  allocate(gs_device_mpi_t::gs%comm)
121  else
122  call neko_log%message('Comm : MPI')
123  allocate(gs_mpi_t::gs%comm)
124  end if
125 
126  call gs%comm%init_dofs()
127  call gs_init_mapping(gs)
128 
129  call gs_schedule(gs)
130 
131  glb_nlocal = int(gs%nlocal, i8)
132  glb_nshared = int(gs%nshared, i8)
133 
134  if (pe_rank .eq. 0) then
135  call mpi_reduce(mpi_in_place, glb_nlocal, 1, &
136  mpi_integer8, mpi_sum, 0, neko_comm, ierr)
137 
138  call mpi_reduce(mpi_in_place, glb_nshared, 1, &
139  mpi_integer8, mpi_sum, 0, neko_comm, ierr)
140  else
141  call mpi_reduce(glb_nlocal, glb_nlocal, 1, &
142  mpi_integer8, mpi_sum, 0, neko_comm, ierr)
143 
144  call mpi_reduce(glb_nshared, glb_nshared, 1, &
145  mpi_integer8, mpi_sum, 0, neko_comm, ierr)
146  end if
147 
148  write(log_buf, '(A,I12)') 'Avg. internal: ', glb_nlocal/pe_size
149  call neko_log%message(log_buf)
150  write(log_buf, '(A,I12)') 'Avg. external: ', glb_nshared/pe_size
151  call neko_log%message(log_buf)
152 
153  if (present(bcknd)) then
154  bcknd_ = bcknd
155  else
156  if (neko_bcknd_sx .eq. 1) then
157  bcknd_ = gs_bcknd_sx
158  else if (neko_bcknd_device .eq. 1) then
159  bcknd_ = gs_bcknd_dev
160  else
161  bcknd_ = gs_bcknd_cpu
162  end if
163  end if
164 
165  ! Setup Gather-scatter backend
166  select case(bcknd_)
167  case(gs_bcknd_cpu)
168  allocate(gs_cpu_t::gs%bcknd)
169  bcknd_str = ' std'
170  case(gs_bcknd_dev)
171  allocate(gs_device_t::gs%bcknd)
172  if (neko_bcknd_hip .eq. 1) then
173  bcknd_str = ' hip'
174  else if (neko_bcknd_cuda .eq. 1) then
175  bcknd_str = ' cuda'
176  else if (neko_bcknd_opencl .eq. 1) then
177  bcknd_str = ' opencl'
178  end if
179  case(gs_bcknd_sx)
180  allocate(gs_sx_t::gs%bcknd)
181  bcknd_str = ' sx'
182  case default
183  call neko_error('Unknown Gather-scatter backend')
184  end select
185 
186  write(log_buf, '(A)') 'Backend : ' // trim(bcknd_str)
187  call neko_log%message(log_buf)
188 
189 
190 
191  call gs%bcknd%init(gs%nlocal, gs%nshared, gs%nlocal_blks, gs%nshared_blks)
192 
193  if (use_device_mpi) then
194  select type(b => gs%bcknd)
195  type is (gs_device_t)
196  b%shared_on_host = .false.
197  end select
198 
199  if(pe_size .gt. 1) then
200  ! Select fastest device MPI strategy at runtime
201  select type(c => gs%comm)
202  type is (gs_device_mpi_t)
203  call get_environment_variable("NEKO_GS_STRTGY", env_strtgy, env_len)
204  if (env_len .eq. 0) then
205  allocate(tmp(dofmap%size()))
206  call device_map(tmp, tmp_d, dofmap%size())
207  tmp = 1.0_rp
208  call device_memcpy(tmp, tmp_d, dofmap%size(), &
209  host_to_device, sync=.false.)
210  call gs_op_vector(gs, tmp, dofmap%size(), gs_op_add)
211 
212  do i = 1, size(strtgy)
213  c%nb_strtgy = strtgy(i)
214  call device_sync
215  call mpi_barrier(neko_comm)
216  strtgy_time(i) = mpi_wtime()
217  do j = 1, 100
218  call gs_op_vector(gs, tmp, dofmap%size(), gs_op_add)
219  end do
220  strtgy_time(i) = (mpi_wtime() - strtgy_time(i)) / 100d0
221  end do
222 
223  call device_deassociate(tmp)
224  call device_free(tmp_d)
225  deallocate(tmp)
226 
227  c%nb_strtgy = strtgy(minloc(strtgy_time, 1))
228 
229  avg_strtgy = minloc(strtgy_time, 1)
230  call mpi_allreduce(mpi_in_place, avg_strtgy, 1, &
231  mpi_integer, mpi_sum, neko_comm)
232  avg_strtgy = avg_strtgy / pe_size
233 
234  write(log_buf, '(A,B0.2,A)') 'Avg. strtgy : [', &
235  strtgy(avg_strtgy),']'
236 
237  else
238  read(env_strtgy(1:env_len), *) i
239 
240  if (i .lt. 1 .or. i .gt. 4) then
241  call neko_error('Invalid gs sync strtgy')
242  end if
243 
244  c%nb_strtgy = strtgy(i)
245  avg_strtgy = i
246 
247  write(log_buf, '(A,B0.2,A)') 'Env. strtgy : [', &
248  strtgy(avg_strtgy),']'
249  end if
250 
251  call neko_log%message(log_buf)
252 
253  end select
254  end if
255  end if
256 
257  call neko_log%end_section()
258 
259  end subroutine gs_init
260 
262  subroutine gs_free(gs)
263  class(gs_t), intent(inout) :: gs
264 
265  nullify(gs%dofmap)
266 
267  if (allocated(gs%local_gs)) then
268  deallocate(gs%local_gs)
269  end if
270 
271  if (allocated(gs%local_dof_gs)) then
272  deallocate(gs%local_dof_gs)
273  end if
274 
275  if (allocated(gs%local_gs_dof)) then
276  deallocate(gs%local_gs_dof)
277  end if
278 
279  if (allocated(gs%local_blk_len)) then
280  deallocate(gs%local_blk_len)
281  end if
282 
283  if (allocated(gs%shared_gs)) then
284  deallocate(gs%shared_gs)
285  end if
286 
287  if (allocated(gs%shared_dof_gs)) then
288  deallocate(gs%shared_dof_gs)
289  end if
290 
291  if (allocated(gs%shared_gs_dof)) then
292  deallocate(gs%shared_gs_dof)
293  end if
294 
295  if (allocated(gs%shared_blk_len)) then
296  deallocate(gs%shared_blk_len)
297  end if
298 
299  gs%nlocal = 0
300  gs%nshared = 0
301  gs%nlocal_blks = 0
302  gs%nshared_blks = 0
303 
304  call gs%shared_dofs%free()
305 
306  if (allocated(gs%bcknd)) then
307  call gs%bcknd%free()
308  deallocate(gs%bcknd)
309  end if
310 
311  if (allocated(gs%comm)) then
312  call gs%comm%free()
313  deallocate(gs%comm)
314  end if
315 
316  end subroutine gs_free
317 
319  subroutine gs_init_mapping(gs)
320  type(gs_t), target, intent(inout) :: gs
321  type(mesh_t), pointer :: msh
322  type(dofmap_t), pointer :: dofmap
323  type(stack_i4_t), target :: local_dof, dof_local, shared_dof, dof_shared
324  type(stack_i4_t), target :: local_face_dof, face_dof_local
325  type(stack_i4_t), target :: shared_face_dof, face_dof_shared
326  integer :: i, j, k, l, lx, ly, lz, max_id, max_sid, id, lid, dm_size
327  type(htable_i8_t) :: dm
328  type(htable_i8_t), pointer :: sdm
329 
330  dofmap => gs%dofmap
331  msh => dofmap%msh
332  sdm => gs%shared_dofs
333 
334  lx = dofmap%Xh%lx
335  ly = dofmap%Xh%ly
336  lz = dofmap%Xh%lz
337  dm_size = dofmap%size()/lx
338 
339  call dm%init(dm_size, i)
342  call sdm%init(dofmap%size(), i)
343 
344 
345  call local_dof%init()
346  call dof_local%init()
347 
348  call local_face_dof%init()
349  call face_dof_local%init()
350 
351  call shared_dof%init()
352  call dof_shared%init()
353 
354  call shared_face_dof%init()
355  call face_dof_shared%init()
356 
357  !
358  ! Setup mapping for dofs points
359  !
360 
361  max_id = 0
362  max_sid = 0
363  do i = 1, msh%nelv
364  lid = linear_index(1, 1, 1, i, lx, ly, lz)
365  if (dofmap%shared_dof(1, 1, 1, i)) then
366  id = gs_mapping_add_dof(sdm, dofmap%dof(1, 1, 1, i), max_sid)
367  call shared_dof%push(id)
368  call dof_shared%push(lid)
369  else
370  id = gs_mapping_add_dof(dm, dofmap%dof(1, 1, 1, i), max_id)
371  call local_dof%push(id)
372  call dof_local%push(lid)
373  end if
374 
375  lid = linear_index(lx, 1, 1, i, lx, ly, lz)
376  if (dofmap%shared_dof(lx, 1, 1, i)) then
377  id = gs_mapping_add_dof(sdm, dofmap%dof(lx, 1, 1, i), max_sid)
378  call shared_dof%push(id)
379  call dof_shared%push(lid)
380  else
381  id = gs_mapping_add_dof(dm, dofmap%dof(lx, 1, 1, i), max_id)
382  call local_dof%push(id)
383  call dof_local%push(lid)
384  end if
385 
386  lid = linear_index(1, ly, 1, i, lx, ly, lz)
387  if (dofmap%shared_dof(1, ly, 1, i)) then
388  id = gs_mapping_add_dof(sdm, dofmap%dof(1, ly, 1, i), max_sid)
389  call shared_dof%push(id)
390  call dof_shared%push(lid)
391  else
392  id = gs_mapping_add_dof(dm, dofmap%dof(1, ly, 1, i), max_id)
393  call local_dof%push(id)
394  call dof_local%push(lid)
395  end if
396 
397  lid = linear_index(lx, ly, 1, i, lx, ly, lz)
398  if (dofmap%shared_dof(lx, ly, 1, i)) then
399  id = gs_mapping_add_dof(sdm, dofmap%dof(lx, ly, 1, i), max_sid)
400  call shared_dof%push(id)
401  call dof_shared%push(lid)
402  else
403  id = gs_mapping_add_dof(dm, dofmap%dof(lx, ly, 1, i), max_id)
404  call local_dof%push(id)
405  call dof_local%push(lid)
406  end if
407  if (lz .gt. 1) then
408  lid = linear_index(1, 1, lz, i, lx, ly, lz)
409  if (dofmap%shared_dof(1, 1, lz, i)) then
410  id = gs_mapping_add_dof(sdm, dofmap%dof(1, 1, lz, i), max_sid)
411  call shared_dof%push(id)
412  call dof_shared%push(lid)
413  else
414  id = gs_mapping_add_dof(dm, dofmap%dof(1, 1, lz, i), max_id)
415  call local_dof%push(id)
416  call dof_local%push(lid)
417  end if
418 
419  lid = linear_index(lx, 1, lz, i, lx, ly, lz)
420  if (dofmap%shared_dof(lx, 1, lz, i)) then
421  id = gs_mapping_add_dof(sdm, dofmap%dof(lx, 1, lz, i), max_sid)
422  call shared_dof%push(id)
423  call dof_shared%push(lid)
424  else
425  id = gs_mapping_add_dof(dm, dofmap%dof(lx, 1, lz, i), max_id)
426  call local_dof%push(id)
427  call dof_local%push(lid)
428  end if
429 
430  lid = linear_index(1, ly, lz, i, lx, ly, lz)
431  if (dofmap%shared_dof(1, ly, lz, i)) then
432  id = gs_mapping_add_dof(sdm, dofmap%dof(1, ly, lz, i), max_sid)
433  call shared_dof%push(id)
434  call dof_shared%push(lid)
435  else
436  id = gs_mapping_add_dof(dm, dofmap%dof(1, ly, lz, i), max_id)
437  call local_dof%push(id)
438  call dof_local%push(lid)
439  end if
440 
441  lid = linear_index(lx, ly, lz, i, lx, ly, lz)
442  if (dofmap%shared_dof(lx, ly, lz, i)) then
443  id = gs_mapping_add_dof(sdm, dofmap%dof(lx, ly, lz, i), max_sid)
444  call shared_dof%push(id)
445  call dof_shared%push(lid)
446  else
447  id = gs_mapping_add_dof(dm, dofmap%dof(lx, ly, lz, i), max_id)
448  call local_dof%push(id)
449  call dof_local%push(lid)
450  end if
451  end if
452  end do
453  if (lz .gt. 1) then
454  !
455  ! Setup mapping for dofs on edges
456  !
457  do i = 1, msh%nelv
458 
459  !
460  ! dofs on edges in x-direction
461  !
462  if (dofmap%shared_dof(2, 1, 1, i)) then
463  do j = 2, lx - 1
464  id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, 1, i), max_sid)
465  call shared_dof%push(id)
466  id = linear_index(j, 1, 1, i, lx, ly, lz)
467  call dof_shared%push(id)
468  end do
469  else
470  do j = 2, lx - 1
471  id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, 1, i), max_id)
472  call local_dof%push(id)
473  id = linear_index(j, 1, 1, i, lx, ly, lz)
474  call dof_local%push(id)
475  end do
476  end if
477  if (dofmap%shared_dof(2, 1, lz, i)) then
478  do j = 2, lx - 1
479  id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, lz, i), max_sid)
480  call shared_dof%push(id)
481  id = linear_index(j, 1, lz, i, lx, ly, lz)
482  call dof_shared%push(id)
483  end do
484  else
485  do j = 2, lx - 1
486  id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, lz, i), max_id)
487  call local_dof%push(id)
488  id = linear_index(j, 1, lz, i, lx, ly, lz)
489  call dof_local%push(id)
490  end do
491  end if
492 
493  if (dofmap%shared_dof(2, ly, 1, i)) then
494  do j = 2, lx - 1
495  id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, 1, i), max_sid)
496  call shared_dof%push(id)
497  id = linear_index(j, ly, 1, i, lx, ly, lz)
498  call dof_shared%push(id)
499  end do
500 
501  else
502  do j = 2, lx - 1
503  id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, 1, i), max_id)
504  call local_dof%push(id)
505  id = linear_index(j, ly, 1, i, lx, ly, lz)
506  call dof_local%push(id)
507  end do
508  end if
509  if (dofmap%shared_dof(2, ly, lz, i)) then
510  do j = 2, lx - 1
511  id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, lz, i), max_sid)
512  call shared_dof%push(id)
513  id = linear_index(j, ly, lz, i, lx, ly, lz)
514  call dof_shared%push(id)
515  end do
516  else
517  do j = 2, lx - 1
518  id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, lz, i), max_id)
519  call local_dof%push(id)
520  id = linear_index(j, ly, lz, i, lx, ly, lz)
521  call dof_local%push(id)
522  end do
523  end if
524 
525  !
526  ! dofs on edges in y-direction
527  !
528  if (dofmap%shared_dof(1, 2, 1, i)) then
529  do k = 2, ly - 1
530  id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, 1, i), max_sid)
531  call shared_dof%push(id)
532  id = linear_index(1, k, 1, i, lx, ly, lz)
533  call dof_shared%push(id)
534  end do
535  else
536  do k = 2, ly - 1
537  id = gs_mapping_add_dof(dm, dofmap%dof(1, k, 1, i), max_id)
538  call local_dof%push(id)
539  id = linear_index(1, k, 1, i, lx, ly, lz)
540  call dof_local%push(id)
541  end do
542  end if
543  if (dofmap%shared_dof(1, 2, lz, i)) then
544  do k = 2, ly - 1
545  id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, lz, i), max_sid)
546  call shared_dof%push(id)
547  id = linear_index(1, k, lz, i, lx, ly, lz)
548  call dof_shared%push(id)
549  end do
550  else
551  do k = 2, ly - 1
552  id = gs_mapping_add_dof(dm, dofmap%dof(1, k, lz, i), max_id)
553  call local_dof%push(id)
554  id = linear_index(1, k, lz, i, lx, ly, lz)
555  call dof_local%push(id)
556  end do
557  end if
558 
559  if (dofmap%shared_dof(lx, 2, 1, i)) then
560  do k = 2, ly - 1
561  id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, 1, i), max_sid)
562  call shared_dof%push(id)
563  id = linear_index(lx, k, 1, i, lx, ly, lz)
564  call dof_shared%push(id)
565  end do
566  else
567  do k = 2, ly - 1
568  id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, 1, i), max_id)
569  call local_dof%push(id)
570  id = linear_index(lx, k, 1, i, lx, ly, lz)
571  call dof_local%push(id)
572  end do
573  end if
574  if (dofmap%shared_dof(lx, 2, lz, i)) then
575  do k = 2, ly - 1
576  id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, lz, i), max_sid)
577  call shared_dof%push(id)
578  id = linear_index(lx, k, lz, i, lx, ly, lz)
579  call dof_shared%push(id)
580  end do
581  else
582  do k = 2, ly - 1
583  id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, lz, i), max_id)
584  call local_dof%push(id)
585  id = linear_index(lx, k, lz, i, lx, ly, lz)
586  call dof_local%push(id)
587  end do
588  end if
589  !
590  ! dofs on edges in z-direction
591  !
592  if (dofmap%shared_dof(1, 1, 2, i)) then
593  do l = 2, lz - 1
594  id = gs_mapping_add_dof(sdm, dofmap%dof(1, 1, l, i), max_sid)
595  call shared_dof%push(id)
596  id = linear_index(1, 1, l, i, lx, ly, lz)
597  call dof_shared%push(id)
598  end do
599  else
600  do l = 2, lz - 1
601  id = gs_mapping_add_dof(dm, dofmap%dof(1, 1, l, i), max_id)
602  call local_dof%push(id)
603  id = linear_index(1, 1, l, i, lx, ly, lz)
604  call dof_local%push(id)
605  end do
606  end if
607 
608  if (dofmap%shared_dof(lx, 1, 2, i)) then
609  do l = 2, lz - 1
610  id = gs_mapping_add_dof(sdm, dofmap%dof(lx, 1, l, i), max_sid)
611  call shared_dof%push(id)
612  id = linear_index(lx, 1, l, i, lx, ly, lz)
613  call dof_shared%push(id)
614  end do
615  else
616  do l = 2, lz - 1
617  id = gs_mapping_add_dof(dm, dofmap%dof(lx, 1, l, i), max_id)
618  call local_dof%push(id)
619  id = linear_index(lx, 1, l, i, lx, ly, lz)
620  call dof_local%push(id)
621  end do
622  end if
623 
624  if (dofmap%shared_dof(1, ly, 2, i)) then
625  do l = 2, lz - 1
626  id = gs_mapping_add_dof(sdm, dofmap%dof(1, ly, l, i), max_sid)
627  call shared_dof%push(id)
628  id = linear_index(1, ly, l, i, lx, ly, lz)
629  call dof_shared%push(id)
630  end do
631  else
632  do l = 2, lz - 1
633  id = gs_mapping_add_dof(dm, dofmap%dof(1, ly, l, i), max_id)
634  call local_dof%push(id)
635  id = linear_index(1, ly, l, i, lx, ly, lz)
636  call dof_local%push(id)
637  end do
638  end if
639 
640  if (dofmap%shared_dof(lx, ly, 2, i)) then
641  do l = 2, lz - 1
642  id = gs_mapping_add_dof(sdm, dofmap%dof(lx, ly, l, i), max_sid)
643  call shared_dof%push(id)
644  id = linear_index(lx, ly, l, i, lx, ly, lz)
645  call dof_shared%push(id)
646  end do
647  else
648  do l = 2, lz - 1
649  id = gs_mapping_add_dof(dm, dofmap%dof(lx, ly, l, i), max_id)
650  call local_dof%push(id)
651  id = linear_index(lx, ly, l, i, lx, ly, lz)
652  call dof_local%push(id)
653  end do
654  end if
655  end do
656  end if
657  !
658  ! Setup mapping for dofs on facets
659  !
660  if (lz .eq. 1) then
661  do i = 1, msh%nelv
662 
663  !
664  ! dofs on edges in x-direction
665  !
666  if (msh%facet_neigh(3, i) .ne. 0) then
667  if (dofmap%shared_dof(2, 1, 1, i)) then
668  do j = 2, lx - 1
669  id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, 1, i), max_sid)
670  call shared_face_dof%push(id)
671  id = linear_index(j, 1, 1, i, lx, ly, lz)
672  call face_dof_shared%push(id)
673  end do
674  else
675  do j = 2, lx - 1
676  id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, 1, i), max_id)
677  call local_face_dof%push(id)
678  id = linear_index(j, 1, 1, i, lx, ly, lz)
679  call face_dof_local%push(id)
680  end do
681  end if
682  end if
683 
684  if (msh%facet_neigh(4, i) .ne. 0) then
685  if (dofmap%shared_dof(2, ly, 1, i)) then
686  do j = 2, lx - 1
687  id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, 1, i), max_sid)
688  call shared_face_dof%push(id)
689  id = linear_index(j, ly, 1, i, lx, ly, lz)
690  call face_dof_shared%push(id)
691  end do
692 
693  else
694  do j = 2, lx - 1
695  id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, 1, i), max_id)
696  call local_face_dof%push(id)
697  id = linear_index(j, ly, 1, i, lx, ly, lz)
698  call face_dof_local%push(id)
699  end do
700  end if
701  end if
702 
703  !
704  ! dofs on edges in y-direction
705  !
706  if (msh%facet_neigh(1, i) .ne. 0) then
707  if (dofmap%shared_dof(1, 2, 1, i)) then
708  do k = 2, ly - 1
709  id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, 1, i), max_sid)
710  call shared_face_dof%push(id)
711  id = linear_index(1, k, 1, i, lx, ly, lz)
712  call face_dof_shared%push(id)
713  end do
714  else
715  do k = 2, ly - 1
716  id = gs_mapping_add_dof(dm, dofmap%dof(1, k, 1, i), max_id)
717  call local_face_dof%push(id)
718  id = linear_index(1, k, 1, i, lx, ly, lz)
719  call face_dof_local%push(id)
720  end do
721  end if
722  end if
723 
724  if (msh%facet_neigh(2, i) .ne. 0) then
725  if (dofmap%shared_dof(lx, 2, 1, i)) then
726  do k = 2, ly - 1
727  id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, 1, i), max_sid)
728  call shared_face_dof%push(id)
729  id = linear_index(lx, k, 1, i, lx, ly, lz)
730  call face_dof_shared%push(id)
731  end do
732  else
733  do k = 2, ly - 1
734  id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, 1, i), max_id)
735  call local_face_dof%push(id)
736  id = linear_index(lx, k, 1, i, lx, ly, lz)
737  call face_dof_local%push(id)
738  end do
739  end if
740  end if
741  end do
742  else
743  do i = 1, msh%nelv
744 
745  ! Facets in x-direction (s, t)-plane
746  if (msh%facet_neigh(1, i) .ne. 0) then
747  if (dofmap%shared_dof(1, 2, 2, i)) then
748  do l = 2, lz - 1
749  do k = 2, ly - 1
750  id = gs_mapping_add_dof(sdm, dofmap%dof(1, k, l, i), max_sid)
751  call shared_face_dof%push(id)
752  id = linear_index(1, k, l, i, lx, ly, lz)
753  call face_dof_shared%push(id)
754  end do
755  end do
756  else
757  do l = 2, lz - 1
758  do k = 2, ly - 1
759  id = gs_mapping_add_dof(dm, dofmap%dof(1, k, l, i), max_id)
760  call local_face_dof%push(id)
761  id = linear_index(1, k, l, i, lx, ly, lz)
762  call face_dof_local%push(id)
763  end do
764  end do
765  end if
766  end if
767 
768  if (msh%facet_neigh(2, i) .ne. 0) then
769  if (dofmap%shared_dof(lx, 2, 2, i)) then
770  do l = 2, lz - 1
771  do k = 2, ly - 1
772  id = gs_mapping_add_dof(sdm, dofmap%dof(lx, k, l, i), max_sid)
773  call shared_face_dof%push(id)
774  id = linear_index(lx, k, l, i, lx, ly, lz)
775  call face_dof_shared%push(id)
776  end do
777  end do
778  else
779  do l = 2, lz - 1
780  do k = 2, ly - 1
781  id = gs_mapping_add_dof(dm, dofmap%dof(lx, k, l, i), max_id)
782  call local_face_dof%push(id)
783  id = linear_index(lx, k, l, i, lx, ly, lz)
784  call face_dof_local%push(id)
785  end do
786  end do
787  end if
788  end if
789 
790  ! Facets in y-direction (r, t)-plane
791  if (msh%facet_neigh(3, i) .ne. 0) then
792  if (dofmap%shared_dof(2, 1, 2, i)) then
793  do l = 2, lz - 1
794  do j = 2, lx - 1
795  id = gs_mapping_add_dof(sdm, dofmap%dof(j, 1, l, i), max_sid)
796  call shared_face_dof%push(id)
797  id = linear_index(j, 1, l, i, lx, ly, lz)
798  call face_dof_shared%push(id)
799  end do
800  end do
801  else
802  do l = 2, lz - 1
803  do j = 2, lx - 1
804  id = gs_mapping_add_dof(dm, dofmap%dof(j, 1, l, i), max_id)
805  call local_face_dof%push(id)
806  id = linear_index(j, 1, l, i, lx, ly, lz)
807  call face_dof_local%push(id)
808  end do
809  end do
810  end if
811  end if
812 
813  if (msh%facet_neigh(4, i) .ne. 0) then
814  if (dofmap%shared_dof(2, ly, 2, i)) then
815  do l = 2, lz - 1
816  do j = 2, lx - 1
817  id = gs_mapping_add_dof(sdm, dofmap%dof(j, ly, l, i), max_sid)
818  call shared_face_dof%push(id)
819  id = linear_index(j, ly, l, i, lx, ly, lz)
820  call face_dof_shared%push(id)
821  end do
822  end do
823  else
824  do l = 2, lz - 1
825  do j = 2, lx - 1
826  id = gs_mapping_add_dof(dm, dofmap%dof(j, ly, l, i), max_id)
827  call local_face_dof%push(id)
828  id = linear_index(j, ly, l, i, lx, ly, lz)
829  call face_dof_local%push(id)
830  end do
831  end do
832  end if
833  end if
834 
835  ! Facets in z-direction (r, s)-plane
836  if (msh%facet_neigh(5, i) .ne. 0) then
837  if (dofmap%shared_dof(2, 2, 1, i)) then
838  do k = 2, ly - 1
839  do j = 2, lx - 1
840  id = gs_mapping_add_dof(sdm, dofmap%dof(j, k, 1, i), max_sid)
841  call shared_face_dof%push(id)
842  id = linear_index(j, k, 1, i, lx, ly, lz)
843  call face_dof_shared%push(id)
844  end do
845  end do
846  else
847  do k = 2, ly - 1
848  do j = 2, lx - 1
849  id = gs_mapping_add_dof(dm, dofmap%dof(j, k, 1, i), max_id)
850  call local_face_dof%push(id)
851  id = linear_index(j, k, 1, i, lx, ly, lz)
852  call face_dof_local%push(id)
853  end do
854  end do
855  end if
856  end if
857 
858  if (msh%facet_neigh(6, i) .ne. 0) then
859  if (dofmap%shared_dof(2, 2, lz, i)) then
860  do k = 2, ly - 1
861  do j = 2, lx - 1
862  id = gs_mapping_add_dof(sdm, dofmap%dof(j, k, lz, i), max_sid)
863  call shared_face_dof%push(id)
864  id = linear_index(j, k, lz, i, lx, ly, lz)
865  call face_dof_shared%push(id)
866  end do
867  end do
868  else
869  do k = 2, ly - 1
870  do j = 2, lx - 1
871  id = gs_mapping_add_dof(dm, dofmap%dof(j, k, lz, i), max_id)
872  call local_face_dof%push(id)
873  id = linear_index(j, k, lz, i, lx, ly, lz)
874  call face_dof_local%push(id)
875  end do
876  end do
877  end if
878  end if
879  end do
880  end if
881 
882 
883  call dm%free()
884 
885  gs%nlocal = local_dof%size() + local_face_dof%size()
886  gs%local_facet_offset = local_dof%size() + 1
887 
888  ! Finalize local dof to gather-scatter index
889  allocate(gs%local_dof_gs(gs%nlocal))
890 
891  ! Add dofs on points and edges
892 
893  ! We should use the %array() procedure, which works great for
894  ! GNU, Intel and NEC, but it breaks horribly on Cray when using
895  ! certain data types
896  select type(dof_array => local_dof%data)
897  type is (integer)
898  j = local_dof%size()
899  do i = 1, j
900  gs%local_dof_gs(i) = dof_array(i)
901  end do
902  end select
903  call local_dof%free()
904 
905  ! Add dofs on faces
906 
907  ! We should use the %array() procedure, which works great for
908  ! GNU, Intel and NEC, but it breaks horribly on Cray when using
909  ! certain data types
910  select type(dof_array => local_face_dof%data)
911  type is (integer)
912  do i = 1, local_face_dof%size()
913  gs%local_dof_gs(i + j) = dof_array(i)
914  end do
915  end select
916  call local_face_dof%free()
917 
918  ! Finalize local gather-scatter index to dof
919  allocate(gs%local_gs_dof(gs%nlocal))
920 
921  ! Add gather-scatter index on points and edges
922 
923  ! We should use the %array() procedure, which works great for
924  ! GNU, Intel and NEC, but it breaks horribly on Cray when using
925  ! certain data types
926  select type(dof_array => dof_local%data)
927  type is (integer)
928  j = dof_local%size()
929  do i = 1, j
930  gs%local_gs_dof(i) = dof_array(i)
931  end do
932  end select
933  call dof_local%free()
934 
935  ! We should use the %array() procedure, which works great for
936  ! GNU, Intel and NEC, but it breaks horribly on Cray when using
937  ! certain data types
938  select type(dof_array => face_dof_local%data)
939  type is (integer)
940  do i = 1, face_dof_local%size()
941  gs%local_gs_dof(i+j) = dof_array(i)
942  end do
943  end select
944  call face_dof_local%free()
945 
946  call gs_qsort_dofmap(gs%local_dof_gs, gs%local_gs_dof, &
947  gs%nlocal, 1, gs%nlocal)
948 
949  call gs_find_blks(gs%local_dof_gs, gs%local_blk_len, &
950  gs%nlocal_blks, gs%nlocal, gs%local_facet_offset)
951 
952  ! Allocate buffer for local gs-ops
953  allocate(gs%local_gs(gs%nlocal))
954 
955  gs%nshared = shared_dof%size() + shared_face_dof%size()
956  gs%shared_facet_offset = shared_dof%size() + 1
957 
958  ! Finalize shared dof to gather-scatter index
959  allocate(gs%shared_dof_gs(gs%nshared))
960 
961  ! Add shared dofs on points and edges
962 
963  ! We should use the %array() procedure, which works great for
964  ! GNU, Intel and NEC, but it breaks horribly on Cray when using
965  ! certain data types
966  select type(dof_array => shared_dof%data)
967  type is (integer)
968  j = shared_dof%size()
969  do i = 1, j
970  gs%shared_dof_gs(i) = dof_array(i)
971  end do
972  end select
973  call shared_dof%free()
974 
975  ! Add shared dofs on faces
976 
977  ! We should use the %array() procedure, which works great for
978  ! GNU, Intel and NEC, but it breaks horribly on Cray when using
979  ! certain data types
980  select type(dof_array => shared_face_dof%data)
981  type is (integer)
982  do i = 1, shared_face_dof%size()
983  gs%shared_dof_gs(i + j) = dof_array(i)
984  end do
985  end select
986  call shared_face_dof%free()
987 
988  ! Finalize shared gather-scatter index to dof
989  allocate(gs%shared_gs_dof(gs%nshared))
990 
991  ! Add dofs on points and edges
992 
993  ! We should use the %array() procedure, which works great for
994  ! GNU, Intel and NEC, but it breaks horribly on Cray when using
995  ! certain data types
996  select type(dof_array => dof_shared%data)
997  type is(integer)
998  j = dof_shared%size()
999  do i = 1, j
1000  gs%shared_gs_dof(i) = dof_array(i)
1001  end do
1002  end select
1003  call dof_shared%free()
1004 
1005  ! We should use the %array() procedure, which works great for
1006  ! GNU, Intel and NEC, but it breaks horribly on Cray when using
1007  ! certain data types
1008  select type(dof_array => face_dof_shared%data)
1009  type is (integer)
1010  do i = 1, face_dof_shared%size()
1011  gs%shared_gs_dof(i + j) = dof_array(i)
1012  end do
1013  end select
1014  call face_dof_shared%free()
1015 
1016  ! Allocate buffer for shared gs-ops
1017  allocate(gs%shared_gs(gs%nshared))
1018 
1019  if (gs%nshared .gt. 0) then
1020  call gs_qsort_dofmap(gs%shared_dof_gs, gs%shared_gs_dof, &
1021  gs%nshared, 1, gs%nshared)
1022 
1023  call gs_find_blks(gs%shared_dof_gs, gs%shared_blk_len, &
1024  gs%nshared_blks, gs%nshared, gs%shared_facet_offset)
1025  end if
1026 
1027  contains
1028 
1030  function gs_mapping_add_dof(map_, dof, max_id) result(id)
1031  type(htable_i8_t), intent(inout) :: map_
1032  integer(kind=i8), intent(inout) :: dof
1033  integer, intent(inout) :: max_id
1034  integer :: id
1035 
1036  if (map_%get(dof, id) .gt. 0) then
1037  max_id = max_id + 1
1038  call map_%set(dof, max_id)
1039  id = max_id
1040  end if
1041 
1042  end function gs_mapping_add_dof
1043 
1045  recursive subroutine gs_qsort_dofmap(dg, gd, n, lo, hi)
1046  integer, intent(inout) :: n
1047  integer, dimension(n), intent(inout) :: dg
1048  integer, dimension(n), intent(inout) :: gd
1049  integer :: lo, hi
1050  integer :: tmp, i, j, pivot
1051 
1052  i = lo - 1
1053  j = hi + 1
1054  pivot = dg((lo + hi) / 2)
1055  do
1056  do
1057  i = i + 1
1058  if (dg(i) .ge. pivot) exit
1059  end do
1060 
1061  do
1062  j = j - 1
1063  if (dg(j) .le. pivot) exit
1064  end do
1065 
1066  if (i .lt. j) then
1067  tmp = dg(i)
1068  dg(i) = dg(j)
1069  dg(j) = tmp
1070 
1071  tmp = gd(i)
1072  gd(i) = gd(j)
1073  gd(j) = tmp
1074  else if (i .eq. j) then
1075  i = i + 1
1076  exit
1077  else
1078  exit
1079  end if
1080  end do
1081  if (lo .lt. j) call gs_qsort_dofmap(dg, gd, n, lo, j)
1082  if (i .lt. hi) call gs_qsort_dofmap(dg, gd, n, i, hi)
1083 
1084  end subroutine gs_qsort_dofmap
1085 
1087  subroutine gs_find_blks(dg, blk_len, nblks, n, m)
1088  integer, intent(in) :: n
1089  integer, intent(in) :: m
1090  integer, dimension(n), intent(inout) :: dg
1091  integer, allocatable, intent(inout) :: blk_len(:)
1092  integer, intent(inout) :: nblks
1093  integer :: i, j
1094  integer :: id, count
1095  type(stack_i4_t), target :: blks
1096 
1097  call blks%init()
1098  i = 1
1099  do while( i .lt. m)
1100  id = dg(i)
1101  count = 1
1102  j = i
1103  do while ( j+1 .le. n .and. dg(j+1) .eq. id)
1104  j = j + 1
1105  count = count + 1
1106  end do
1107  call blks%push(count)
1108  i = j + 1
1109  end do
1110 
1111  select type(blk_array => blks%data)
1112  type is(integer)
1113  nblks = blks%size()
1114  allocate(blk_len(nblks))
1115  do i = 1, nblks
1116  blk_len(i) = blk_array(i)
1117  end do
1118  end select
1119  call blks%free()
1120 
1121  end subroutine gs_find_blks
1122 
1123  end subroutine gs_init_mapping
1124 
1126  subroutine gs_schedule(gs)
1127  type(gs_t), target, intent(inout) :: gs
1128  integer(kind=i8), allocatable :: send_buf(:), recv_buf(:)
1129  integer(kind=i2), allocatable :: shared_flg(:), recv_flg(:)
1130  type(htable_iter_i8_t) :: it
1131  type(stack_i4_t) :: send_pe, recv_pe
1132  type(mpi_status) :: status
1133  type(mpi_request) :: send_req, recv_req
1134  integer :: i, j, max_recv, src, dst, ierr, n_recv
1135  integer :: tmp, shared_gs_id
1136  integer :: nshared_unique
1137 
1138  nshared_unique = gs%shared_dofs%num_entries()
1139 
1140  call it%init(gs%shared_dofs)
1141  allocate(send_buf(nshared_unique))
1142  i = 1
1143  do while(it%next())
1144  send_buf(i) = it%key()
1145  i = i + 1
1146  end do
1147 
1148  call send_pe%init()
1149  call recv_pe%init()
1150 
1151 
1152  !
1153  ! Schedule exchange of shared dofs
1154  !
1155 
1156  call mpi_allreduce(nshared_unique, max_recv, 1, &
1157  mpi_integer, mpi_max, neko_comm, ierr)
1158 
1159  allocate(recv_buf(max_recv))
1160  allocate(shared_flg(max_recv))
1161  allocate(recv_flg(max_recv))
1162 
1164  do i = 1, size(gs%dofmap%msh%neigh_order)
1165  src = modulo(pe_rank - gs%dofmap%msh%neigh_order(i) + pe_size, pe_size)
1166  dst = modulo(pe_rank + gs%dofmap%msh%neigh_order(i), pe_size)
1167 
1168  if (gs%dofmap%msh%neigh(src)) then
1169  call mpi_irecv(recv_buf, max_recv, mpi_integer8, &
1170  src, 0, neko_comm, recv_req, ierr)
1171  end if
1172 
1173  if (gs%dofmap%msh%neigh(dst)) then
1174  call mpi_isend(send_buf, nshared_unique, mpi_integer8, &
1175  dst, 0, neko_comm, send_req, ierr)
1176  end if
1177 
1178  if (gs%dofmap%msh%neigh(src)) then
1179  call mpi_wait(recv_req, status, ierr)
1180  call mpi_get_count(status, mpi_integer8, n_recv, ierr)
1181 
1182  do j = 1, n_recv
1183  shared_flg(j) = gs%shared_dofs%get(recv_buf(j), shared_gs_id)
1184  if (shared_flg(j) .eq. 0) then
1186  call gs%comm%recv_dof(src)%push(shared_gs_id)
1187  end if
1188  end do
1189 
1190  if (gs%comm%recv_dof(src)%size() .gt. 0) then
1191  call recv_pe%push(src)
1192  end if
1193  end if
1194 
1195  if (gs%dofmap%msh%neigh(dst)) then
1196  call mpi_wait(send_req, mpi_status_ignore, ierr)
1197  call mpi_irecv(recv_flg, max_recv, mpi_integer2, &
1198  dst, 0, neko_comm, recv_req, ierr)
1199  end if
1200 
1201  if (gs%dofmap%msh%neigh(src)) then
1202  call mpi_isend(shared_flg, n_recv, mpi_integer2, &
1203  src, 0, neko_comm, send_req, ierr)
1204  end if
1205 
1206  if (gs%dofmap%msh%neigh(dst)) then
1207  call mpi_wait(recv_req, status, ierr)
1208  call mpi_get_count(status, mpi_integer2, n_recv, ierr)
1209 
1210  do j = 1, n_recv
1211  if (recv_flg(j) .eq. 0) then
1212  tmp = gs%shared_dofs%get(send_buf(j), shared_gs_id)
1214  call gs%comm%send_dof(dst)%push(shared_gs_id)
1215  end if
1216  end do
1217 
1218  if (gs%comm%send_dof(dst)%size() .gt. 0) then
1219  call send_pe%push(dst)
1220  end if
1221  end if
1222 
1223  if (gs%dofmap%msh%neigh(src)) then
1224  call mpi_wait(send_req, mpi_status_ignore, ierr)
1225  end if
1226 
1227  end do
1228 
1229  call gs%comm%init(send_pe, recv_pe)
1230 
1231  call send_pe%free()
1232  call recv_pe%free()
1233 
1234  deallocate(send_buf)
1235  deallocate(recv_flg)
1236  deallocate(shared_flg)
1237  !This arrays seems to take massive amounts of memory...
1238  call gs%shared_dofs%free()
1239 
1240  end subroutine gs_schedule
1241 
1243  subroutine gs_op_fld(gs, u, op, event)
1244  class(gs_t), intent(inout) :: gs
1245  type(field_t), intent(inout) :: u
1246  type(c_ptr), optional, intent(inout) :: event
1247  integer :: n, op
1248 
1249  n = u%msh%nelv * u%Xh%lx * u%Xh%ly * u%Xh%lz
1250  if (present(event)) then
1251  call gs_op_vector(gs, u%x, n, op, event)
1252  else
1253  call gs_op_vector(gs, u%x, n, op)
1254  end if
1255 
1256  end subroutine gs_op_fld
1257 
1259  subroutine gs_op_r4(gs, u, n, op, event)
1260  class(gs_t), intent(inout) :: gs
1261  integer, intent(in) :: n
1262  real(kind=rp), contiguous, dimension(:,:,:,:), intent(inout) :: u
1263  type(c_ptr), optional, intent(inout) :: event
1264  integer :: op
1265 
1266  if (present(event)) then
1267  call gs_op_vector(gs, u, n, op, event)
1268  else
1269  call gs_op_vector(gs, u, n, op)
1270  end if
1271 
1272  end subroutine gs_op_r4
1273 
1275  subroutine gs_op_vector(gs, u, n, op, event)
1276  class(gs_t), intent(inout) :: gs
1277  integer, intent(in) :: n
1278  real(kind=rp), dimension(n), intent(inout) :: u
1279  type(c_ptr), optional, intent(inout) :: event
1280  integer :: m, l, op, lo, so
1281 
1282  lo = gs%local_facet_offset
1283  so = -gs%shared_facet_offset
1284  m = gs%nlocal
1285  l = gs%nshared
1286 
1287  call profiler_start_region("gather_scatter", 5)
1288  ! Gather shared dofs
1289  if (pe_size .gt. 1) then
1290  call profiler_start_region("gs_nbrecv", 13)
1291  call gs%comm%nbrecv()
1292  call profiler_end_region("gs_nbrecv", 13)
1293  call profiler_start_region("gs_gather_shared", 14)
1294  call gs%bcknd%gather(gs%shared_gs, l, so, gs%shared_dof_gs, u, n, &
1295  gs%shared_gs_dof, gs%nshared_blks, gs%shared_blk_len, op, .true.)
1296  call profiler_end_region("gs_gather_shared", 14)
1297  call profiler_start_region("gs_nbsend", 6)
1298  call gs%comm%nbsend(gs%shared_gs, l, &
1299  gs%bcknd%gather_event, gs%bcknd%gs_stream)
1300  call profiler_end_region("gs_nbsend", 6)
1301 
1302  end if
1303 
1304  ! Gather-scatter local dofs
1305  call profiler_start_region("gs_local", 12)
1306  call gs%bcknd%gather(gs%local_gs, m, lo, gs%local_dof_gs, u, n, &
1307  gs%local_gs_dof, gs%nlocal_blks, gs%local_blk_len, op, .false.)
1308  call gs%bcknd%scatter(gs%local_gs, m, gs%local_dof_gs, u, n, &
1309  gs%local_gs_dof, gs%nlocal_blks, gs%local_blk_len, .false., c_null_ptr)
1310  call profiler_end_region("gs_local", 12)
1311  ! Scatter shared dofs
1312  if (pe_size .gt. 1) then
1313  call profiler_start_region("gs_nbwait", 7)
1314  call gs%comm%nbwait(gs%shared_gs, l, op, gs%bcknd%gs_stream)
1315  call profiler_end_region("gs_nbwait", 7)
1316  call profiler_start_region("gs_scatter_shared", 15)
1317  if (present(event)) then
1318  call gs%bcknd%scatter(gs%shared_gs, l,&
1319  gs%shared_dof_gs, u, n, &
1320  gs%shared_gs_dof, gs%nshared_blks, &
1321  gs%shared_blk_len, .true., event)
1322  else
1323  call gs%bcknd%scatter(gs%shared_gs, l,&
1324  gs%shared_dof_gs, u, n, &
1325  gs%shared_gs_dof, gs%nshared_blks, &
1326  gs%shared_blk_len, .true., c_null_ptr)
1327  end if
1328  call profiler_end_region("gs_scatter_shared", 15)
1329  end if
1330 
1331  call profiler_end_region("gather_scatter", 5)
1332 
1333  end subroutine gs_op_vector
1334 
1335 end module gather_scatter
subroutine gs_find_blks(dg, blk_len, nblks, n, m)
Find blocks sharing dofs in non-facet data.
recursive subroutine gs_qsort_dofmap(dg, gd, n, lo, hi)
Sort the dof lists based on the dof to gather-scatter list.
integer function gs_mapping_add_dof(map_, dof, max_id)
Register a unique dof.
Deassociate a Fortran array from a device pointer.
Definition: device.F90:75
Map a Fortran array to a device (allocate and associate)
Definition: device.F90:57
Copy data between host and device (or device and device)
Definition: device.F90:51
Synchronize a device or stream.
Definition: device.F90:87
Definition: comm.F90:1
integer pe_rank
MPI rank.
Definition: comm.F90:26
type(mpi_comm) neko_comm
MPI communicator.
Definition: comm.F90:16
integer pe_size
MPI size of communicator.
Definition: comm.F90:29
Device abstraction, common interface for various accelerators.
Definition: device.F90:34
integer, parameter, public host_to_device
Definition: device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition: device.F90:185
Defines a mapping of the degrees of freedom.
Definition: dofmap.f90:35
Defines a field.
Definition: field.f90:34
Gather-scatter.
subroutine gs_init_mapping(gs)
Setup mapping of dofs to gather-scatter operations.
subroutine gs_init(gs, dofmap, bcknd)
Initialize a gather-scatter kernel.
subroutine gs_schedule(gs)
Schedule shared gather-scatter operations.
subroutine gs_free(gs)
Deallocate a gather-scatter kernel.
subroutine gs_op_r4(gs, u, n, op, event)
Gather-scatter operation on a rank 4 array.
subroutine gs_op_vector(gs, u, n, op, event)
Gather-scatter operation on a vector u with op op.
subroutine gs_op_fld(gs, u, op, event)
Gather-scatter operation on a field u with op op.
Defines a gather-scatter backend.
Definition: gs_bcknd.f90:34
integer, parameter, public gs_bcknd_cpu
Definition: gs_bcknd.f90:40
integer, parameter, public gs_bcknd_sx
Definition: gs_bcknd.f90:40
integer, parameter, public gs_bcknd_dev
Definition: gs_bcknd.f90:40
Defines a gather-scatter communication method.
Definition: gs_comm.f90:34
Generic Gather-scatter backend for CPUs.
Definition: gs_cpu.f90:34
Defines GPU aware MPI gather-scatter communication.
Generic Gather-scatter backend for accelerators.
Definition: gs_device.F90:34
Defines MPI gather-scatter communication.
Definition: gs_mpi.f90:34
Defines Gather-scatter operations.
Definition: gs_ops.f90:34
integer, parameter, public gs_op_add
Definition: gs_ops.f90:36
integer, parameter, public gs_op_max
Definition: gs_ops.f90:36
integer, parameter, public gs_op_min
Definition: gs_ops.f90:36
integer, parameter, public gs_op_mul
Definition: gs_ops.f90:36
Generic Gather-scatter backend for NEC Vector Engines.
Definition: gs_sx.f90:34
Implements a hash table ADT.
Definition: htable.f90:36
Logging routines.
Definition: log.f90:34
type(log_t), public neko_log
Global log stream.
Definition: log.f90:61
integer, parameter, public log_size
Definition: log.f90:40
Defines a mesh.
Definition: mesh.f90:34
Build configurations.
Definition: neko_config.f90:34
integer, parameter neko_bcknd_sx
Definition: neko_config.f90:39
integer, parameter neko_bcknd_hip
Definition: neko_config.f90:42
integer, parameter neko_bcknd_device
Definition: neko_config.f90:44
integer, parameter neko_bcknd_opencl
Definition: neko_config.f90:43
logical, parameter neko_device_mpi
Definition: neko_config.f90:46
integer, parameter neko_bcknd_cuda
Definition: neko_config.f90:41
integer, parameter, public i2
Definition: num_types.f90:5
integer, parameter, public dp
Definition: num_types.f90:9
integer, parameter, public rp
Global precision used in computations.
Definition: num_types.f90:12
Profiling interface.
Definition: profiler.F90:34
subroutine, public profiler_start_region(name, region_id)
Started a named (name) profiler region.
Definition: profiler.F90:78
subroutine, public profiler_end_region(name, region_id)
End the most recently started profiler region.
Definition: profiler.F90:109
Implements a dynamic stack ADT.
Definition: stack.f90:35
Utilities.
Definition: utils.f90:35
pure integer function, public linear_index(i, j, k, l, lx, ly, lz)
Compute the address of a (i,j,k,l) array with sizes (1:lx, 1:ly, 1:lz, :)
Definition: utils.f90:130
Gather-scatter backend.
Definition: gs_bcknd.f90:44
Gather-scatter communication method.
Definition: gs_comm.f90:45
Gather-scatter backend for CPUs.
Definition: gs_cpu.f90:43
Gather-scatter backend for offloading devices.
Definition: gs_device.F90:47
Gather-scatter communication using device MPI. The arrays are indexed per PE like send_pe and @ recv_...
Gather-scatter communication using MPI.
Definition: gs_mpi.f90:54
Gather-scatter backend for NEC SX-Aurora.
Definition: gs_sx.f90:43
Integer*8 based hash table.
Definition: htable.f90:92
Iterator for an integer*8 based hash table.
Definition: htable.f90:175
Integer based stack.
Definition: stack.f90:63