Neko 0.9.99
A portable framework for high-order spectral element flow simulations
gs_device.F90
1 ! Copyright (c) 2021-2022, The Neko Authors
2 ! All rights reserved.
3 !
4 ! Redistribution and use in source and binary forms, with or without
5 ! modification, are permitted provided that the following conditions
6 ! are met:
7 !
8 ! * Redistributions of source code must retain the above copyright
9 ! notice, this list of conditions and the following disclaimer.
10 !
11 ! * Redistributions in binary form must reproduce the above
12 ! copyright notice, this list of conditions and the following
13 ! disclaimer in the documentation and/or other materials provided
14 ! with the distribution.
15 !
16 ! * Neither the name of the authors nor the names of its
17 ! contributors may be used to endorse or promote products derived
18 ! from this software without specific prior written permission.
19 !
20 ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 ! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 ! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 ! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 ! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 ! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 ! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 ! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 ! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 ! POSSIBILITY OF SUCH DAMAGE.
32 !
33 !> Generic Gather-scatter backend for accelerators
34 module gs_device
35  use neko_config
36  use num_types
37  use gs_bcknd
38  use device
39  use gs_ops
40  use utils
41  use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_null_ptr, &
42  c_associated
43  implicit none
44  private
45 
46 !> Gather-scatter backend for offloading devices
47  type, public, extends(gs_bcknd_t) :: gs_device_t
48  integer, allocatable :: local_blk_off(:)
49  integer, allocatable :: shared_blk_off(:)
50  type(c_ptr) :: local_gs_d = c_null_ptr
51  type(c_ptr) :: local_dof_gs_d = c_null_ptr
52  type(c_ptr) :: local_gs_dof_d = c_null_ptr
53  type(c_ptr) :: shared_gs_d = c_null_ptr
54  type(c_ptr) :: shared_dof_gs_d = c_null_ptr
55  type(c_ptr) :: shared_gs_dof_d = c_null_ptr
56  type(c_ptr) :: local_blk_len_d = c_null_ptr
57  type(c_ptr) :: shared_blk_len_d = c_null_ptr
58  type(c_ptr) :: local_blk_off_d = c_null_ptr
59  type(c_ptr) :: shared_blk_off_d = c_null_ptr
60  integer :: nlocal
61  integer :: nshared
62  logical :: shared_on_host
63  contains
64  procedure, pass(this) :: init => gs_device_init
65  procedure, pass(this) :: free => gs_device_free
66  procedure, pass(this) :: gather => gs_gather_device
67  procedure, pass(this) :: scatter => gs_scatter_device
68  end type gs_device_t
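The local_blk_off and shared_blk_off arrays hold the exclusive prefix sum of the corresponding block-length arrays; gs_gather_device fills them in lazily the first time a gather runs. A minimal standalone illustration of that relationship (not part of gs_device.F90):

    ! Illustration only: how block offsets are derived from block lengths,
    ! mirroring the loop in gs_gather_device further down this file.
    program blk_off_demo
      implicit none
      integer, parameter :: nb = 3
      integer :: b(nb) = [3, 2, 4]   ! block lengths
      integer :: bo(nb)              ! exclusive prefix sum of b
      integer :: i

      bo(1) = 0
      do i = 2, nb
         bo(i) = bo(i - 1) + b(i - 1)
      end do

      ! Block i covers entries bo(i)+1 .. bo(i)+b(i) of the gather lists,
      ! so here bo = [0, 3, 5].
      print *, bo
    end program blk_off_demo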
69 
70 #ifdef HAVE_HIP
71  interface
72  subroutine hip_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
73  bind(c, name='hip_gather_kernel')
74  use, intrinsic :: iso_c_binding
75  implicit none
76  integer(c_int) :: m, n, nb, o, op
77  type(c_ptr), value :: v, u, dg, gd, b, bo, strm
78  end subroutine hip_gather_kernel
79  end interface
80 
81  interface
82  subroutine hip_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
83  bind(c, name='hip_scatter_kernel')
84  use, intrinsic :: iso_c_binding
85  implicit none
86  integer(c_int) :: m, n, nb
87  type(c_ptr), value :: v, u, dg, gd, b, bo, strm
88  end subroutine hip_scatter_kernel
89  end interface
90 
91 #elif HAVE_CUDA
92  interface
93  subroutine cuda_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
94  bind(c, name='cuda_gather_kernel')
95  use, intrinsic :: iso_c_binding
96  implicit none
97  integer(c_int) :: m, n, nb, o, op
98  type(c_ptr), value :: v, u, dg, gd, b, bo, strm
99  end subroutine cuda_gather_kernel
100  end interface
101 
102  interface
103  subroutine cuda_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
104  bind(c, name='cuda_scatter_kernel')
105  use, intrinsic :: iso_c_binding
106  implicit none
107  integer(c_int) :: m, n, nb
108  type(c_ptr), value :: v, u, dg, gd, b, bo, strm
109  end subroutine cuda_scatter_kernel
110  end interface
111 #elif HAVE_OPENCL
112  interface
113  subroutine opencl_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op) &
114  bind(c, name='opencl_gather_kernel')
115  use, intrinsic :: iso_c_binding
116  implicit none
117  integer(c_int) :: m, n, nb, o, op
118  type(c_ptr), value :: v, u, dg, gd, b, bo
119  end subroutine opencl_gather_kernel
120  end interface
121 
122  interface
123  subroutine opencl_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo) &
124  bind(c, name='opencl_scatter_kernel')
125  use, intrinsic :: iso_c_binding
126  implicit none
127  integer(c_int) :: m, n, nb
128  type(c_ptr), value :: v, u, dg, gd, b, bo
129  end subroutine opencl_scatter_kernel
130  end interface
131 #endif
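The three interface blocks bind to the same gather and scatter entry points for HIP, CUDA and OpenCL; the HIP and CUDA variants additionally take the stream on which the kernel is launched. Conceptually, a gather with op = add reduces the u entries of each block into a single value in the gather buffer v, reading from u through gd and addressing v through dg. A hedged host-side sketch of that semantics (illustration only, not the actual device kernels; the o argument, which marks the start of the non-blocked tail, is ignored here):

    ! Host-side sketch of a blocked gather with op = add; rp stands in for
    ! the precision parameter from num_types.
    program gather_add_sketch
      implicit none
      integer, parameter :: rp = selected_real_kind(12)
      integer, parameter :: n = 6, m = 4, nb = 2
      real(kind=rp) :: u(n) = [1.0_rp, 2.0_rp, 3.0_rp, 4.0_rp, 5.0_rp, 6.0_rp]
      real(kind=rp) :: v(m)
      integer :: dg(m) = [1, 1, 2, 2]   ! dof -> gather buffer index
      integer :: gd(m) = [2, 5, 3, 6]   ! dof -> index into u
      integer :: b(nb) = [2, 2]         ! block lengths
      integer :: bo(nb) = [0, 2]        ! block offsets
      integer :: i, j
      real(kind=rp) :: tmp

      do i = 1, nb
         tmp = u(gd(bo(i) + 1))
         do j = 2, b(i)
            tmp = tmp + u(gd(bo(i) + j))
         end do
         v(dg(bo(i) + 1)) = tmp         ! gathered value for block i
      end do

      print *, v(1), v(2)               ! 7.0 (= 2 + 5) and 9.0 (= 3 + 6)
    end program gather_add_sketch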
132 
133 contains
134 
135 !> Accelerator backend initialisation
136  subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
137  class(gs_device_t), intent(inout) :: this
138  integer, intent(in) :: nlocal
139  integer, intent(in) :: nshared
140  integer, intent(in) :: nlcl_blks
141  integer, intent(in) :: nshrd_blks
142 
143  call this%free()
144 
145  this%nlocal = nlocal
146  this%nshared = nshared
147 
148  allocate(this%local_blk_off(nlcl_blks))
149  allocate(this%shared_blk_off(nshrd_blks))
150 
151  this%local_gs_d = c_null_ptr
152  this%local_dof_gs_d = c_null_ptr
153  this%local_gs_dof_d = c_null_ptr
154  this%local_blk_len_d = c_null_ptr
155  this%local_blk_off_d = c_null_ptr
156  this%shared_gs_d = c_null_ptr
157  this%shared_dof_gs_d = c_null_ptr
158  this%shared_gs_dof_d = c_null_ptr
159  this%shared_blk_len_d = c_null_ptr
160  this%shared_blk_off_d = c_null_ptr
161 
162  this%shared_on_host = .true.
163 
164 #if defined(HAVE_HIP) || defined(HAVE_CUDA)
165  call device_event_create(this%gather_event, 2)
166  call device_event_create(this%scatter_event, 2)
167 #endif
168 
169  this%gs_stream = glb_cmd_queue
170 
171  end subroutine gs_device_init
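gs_device_init only allocates the host-side offset arrays and resets every device pointer to c_null_ptr; the device buffers themselves are mapped lazily on the first gather. A hypothetical driver fragment (in Neko the gather-scatter type gs_t normally constructs and drives the backend, and a device must already be initialised; the sizes below are made up):

    ! Hypothetical usage sketch, not taken from Neko.
    subroutine use_gs_device_backend()
      use gs_device
      implicit none
      type(gs_device_t) :: bcknd

      call bcknd%init(nlocal = 4096, nshared = 256, &
                      nlcl_blks = 128, nshrd_blks = 16)
      ! ... repeated bcknd%gather(...) / bcknd%scatter(...) calls ...
      call bcknd%free()
    end subroutine use_gs_device_backend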
172 
173 !> Dummy backend deallocation
174  subroutine gs_device_free(this)
175  class(gs_device_t), intent(inout) :: this
176 
177  if (allocated(this%local_blk_off)) then
178  deallocate(this%local_blk_off)
179  end if
180 
181  if (allocated(this%shared_blk_off)) then
182  deallocate(this%shared_blk_off)
183  end if
184 
185  if (c_associated(this%local_gs_d)) then
186  call device_free(this%local_gs_d)
187  end if
188 
189  if (c_associated(this%local_dof_gs_d)) then
190  call device_free(this%local_dof_gs_d)
191  end if
192 
193  if (c_associated(this%local_gs_dof_d)) then
194  call device_free(this%local_gs_dof_d)
195  end if
196 
197  if (c_associated(this%local_blk_len_d)) then
198  call device_free(this%local_blk_len_d)
199  end if
200 
201  if (c_associated(this%shared_blk_len_d)) then
202  call device_free(this%shared_blk_len_d)
203  end if
204 
205  if (c_associated(this%local_blk_off_d)) then
206  call device_free(this%local_blk_off_d)
207  end if
208 
209  if (c_associated(this%shared_blk_off_d)) then
210  call device_free(this%shared_blk_off_d)
211  end if
212 
213  this%nlocal = 0
214  this%nshared = 0
215 
216 #if defined(HAVE_HIP) || defined(HAVE_CUDA)
217  if (c_associated(this%gather_event)) then
218  call device_event_destroy(this%gather_event)
219  end if
220 
221  if (c_associated(this%scatter_event)) then
222  call device_event_destroy(this%scatter_event)
223  end if
224 #endif
225 
226  if (c_associated(this%gs_stream)) then
227  this%gs_stream = c_null_ptr
228  end if
229 
230  end subroutine gs_device_free
231 
232 !> Gather kernel
233  subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
234  integer, intent(in) :: m
235  integer, intent(in) :: n
236  integer, intent(in) :: nb
237  class(gs_device_t), intent(inout) :: this
238  real(kind=rp), dimension(m), intent(inout) :: v
239  integer, dimension(m), intent(inout) :: dg
240  real(kind=rp), dimension(n), intent(inout) :: u
241  integer, dimension(m), intent(inout) :: gd
242  integer, dimension(nb), intent(inout) :: b
243  integer, intent(in) :: o
244  integer, intent(in) :: op
245  logical, intent(in) :: shrd
246  integer :: i
247  type(c_ptr) :: u_d
248 
249  u_d = device_get_ptr(u)
250 
251  if (.not. shrd) then
252  associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
253  gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
254  bo=>this%local_blk_off, bo_d=>this%local_blk_off_d, &
255  strm=>this%gs_stream)
256 
257  if (.not. c_associated(v_d)) then
258  call device_map(v, v_d, m)
259  end if
260 
261  if (.not. c_associated(dg_d)) then
262  call device_map(dg, dg_d, m)
263  call device_memcpy(dg, dg_d, m, host_to_device, &
264  sync=.false., strm=strm)
265  end if
266 
267  if (.not. c_associated(gd_d)) then
268  call device_map(gd, gd_d, m)
269  call device_memcpy(gd, gd_d, m, host_to_device, &
270  sync=.false., strm=strm)
271  end if
272 
273  if (.not. c_associated(b_d)) then
274  call device_map(b, b_d, nb)
275  call device_memcpy(b, b_d, nb, host_to_device, &
276  sync=.false., strm=strm)
277  end if
278 
279  if (.not. c_associated(bo_d)) then
280  call device_map(bo, bo_d, nb)
281  bo(1) = 0
282  do i = 2, nb
283  bo(i) = bo(i - 1) + b(i - 1)
284  end do
285  call device_memcpy(bo, bo_d, nb, host_to_device, &
286  sync=.false., strm=strm)
287  end if
288 
289 #ifdef HAVE_HIP
290  call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
291  nb, b_d, bo_d, op, strm)
292 #elif HAVE_CUDA
293  call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
294  nb, b_d, bo_d, op, strm)
295 #elif HAVE_OPENCL
296  call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
297  nb, b_d, bo_d, op)
298 #else
299  call neko_error('No device backend configured')
300 #endif
301 
302  end associate
303  else if (shrd) then
304  associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
305  gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
306  bo=>this%shared_blk_off, bo_d=>this%shared_blk_off_d, &
307  strm=>this%gs_stream)
308 
309  if (.not. c_associated(v_d)) then
310  call device_map(v, v_d, m)
311  end if
312 
313  if (.not. c_associated(dg_d)) then
314  call device_map(dg, dg_d, m)
315  call device_memcpy(dg, dg_d, m, host_to_device, &
316  sync=.false., strm=strm)
317  end if
318 
319  if (.not. c_associated(gd_d)) then
320  call device_map(gd, gd_d, m)
321  call device_memcpy(gd, gd_d, m, host_to_device, &
322  sync=.false., strm=strm)
323  end if
324 
325  if (.not. c_associated(b_d)) then
326  call device_map(b, b_d, nb)
327  call device_memcpy(b, b_d, nb, host_to_device, &
328  sync=.false., strm=strm)
329  end if
330 
331  if (.not. c_associated(bo_d)) then
332  call device_map(bo, bo_d, nb)
333  bo(1) = 0
334  do i = 2, nb
335  bo(i) = bo(i - 1) + b(i - 1)
336  end do
337  call device_memcpy(bo, bo_d, nb, host_to_device, &
338  sync=.false., strm=strm)
339  end if
340 
341 
342 #ifdef HAVE_HIP
343  call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
344  nb, b_d, bo_d, op, strm)
345 #elif HAVE_CUDA
346  call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
347  nb, b_d, bo_d, op, strm)
348 #elif HAVE_OPENCL
349  call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
350  nb, b_d, bo_d, op)
351 #else
352  call neko_error('No device backend configured')
353 #endif
354 
355 #if defined(HAVE_HIP) || defined(HAVE_CUDA)
356  call device_event_record(this%gather_event, strm)
357 #endif
358 
359  if (this%shared_on_host) then
360  if (this%nshared .eq. m) then
361  call device_memcpy(v, v_d, m, device_to_host, &
362  sync=.true., strm=strm)
363  end if
364  end if
365 
366  end associate
367  end if
368 
369  end subroutine gs_gather_device
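For shared (inter-rank) gathers the routine records gather_event on the gs stream and, when shared_on_host is set, stages the gathered values back to the host with a blocking device_memcpy so they can be exchanged over MPI. A hedged caller-side sketch of that hand-off; it assumes gs_stream (declared in gs_bcknd_t) is visible to the caller, uses the stream form of device_sync that also appears in gs_scatter_device below, and takes GS_OP_ADD from gs_ops:

    ! Hedged caller-side sketch: perform a shared gather, then wait for the
    ! gs stream before handing v to the MPI exchange.
    subroutine shared_gather_then_wait(bcknd, v, m, o, dg, u, n, gd, nb, b)
      use gs_device
      use gs_ops
      use device
      use num_types
      implicit none
      type(gs_device_t), intent(inout) :: bcknd
      integer, intent(in) :: m, n, nb, o
      real(kind=rp), dimension(m), intent(inout) :: v
      real(kind=rp), dimension(n), intent(inout) :: u
      integer, dimension(m), intent(inout) :: dg, gd
      integer, dimension(nb), intent(inout) :: b

      call bcknd%gather(v, m, o, dg, u, n, gd, nb, b, GS_OP_ADD, .true.)
      call device_sync(bcknd%gs_stream)
      ! With shared_on_host, v has already been copied to the host by the
      ! blocking device_memcpy inside gs_gather_device.
    end subroutine shared_gather_then_wait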
370 
371 !> Scatter kernel
372  subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
373  integer, intent(in) :: m
374  integer, intent(in) :: n
375  integer, intent(in) :: nb
376  class(gs_device_t), intent(inout) :: this
377  real(kind=rp), dimension(m), intent(inout) :: v
378  integer, dimension(m), intent(inout) :: dg
379  real(kind=rp), dimension(n), intent(inout) :: u
380  integer, dimension(m), intent(inout) :: gd
381  integer, dimension(nb), intent(inout) :: b
382  logical, intent(in) :: shrd
383  type(c_ptr) :: event
384  type(c_ptr) :: u_d
385 
386  u_d = device_get_ptr(u)
387 
388  if (.not. shrd) then
389  associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
390  gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
391  bo_d=>this%local_blk_off_d, strm=>this%gs_stream)
392 #ifdef HAVE_HIP
393  call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
394 #elif HAVE_CUDA
395  call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
396 #elif HAVE_OPENCL
397  call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d)
398 #else
399  call neko_error('No device backend configured')
400 #endif
401  end associate
402  else if (shrd) then
403  associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
404  gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
405  bo_d=>this%shared_blk_off_d, strm=>this%gs_stream)
406 
407  if (this%shared_on_host) then
408  call device_memcpy(v, v_d, m, host_to_device, &
409  sync=.false., strm=strm)
410  end if
411 
412 #ifdef HAVE_HIP
413  call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
414 #elif HAVE_CUDA
415  call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
416 #elif HAVE_OPENCL
417  call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d)
418 #else
419  call neko_error('No device backend configured')
420 #endif
421 
422 #if defined(HAVE_HIP) || defined(HAVE_CUDA)
423  if (c_associated(event)) then
424  call device_event_record(event, strm)
425  else
426  call device_sync(strm)
427  end if
428 #endif
429 
430  end associate
431  end if
432 
433  end subroutine gs_scatter_device
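Scatter is the inverse operation: each gathered value in v is broadcast back to all degrees of freedom of its block in u. A host-side sketch mirroring the gather example above (illustration only, not the actual device kernels):

    ! Host-side sketch of a blocked scatter; continues the data of the
    ! gather example (v(1) = 7, v(2) = 9).
    program scatter_sketch
      implicit none
      integer, parameter :: rp = selected_real_kind(12)
      integer, parameter :: n = 6, m = 4, nb = 2
      real(kind=rp) :: u(n) = 0.0_rp
      real(kind=rp) :: v(m) = [7.0_rp, 9.0_rp, 0.0_rp, 0.0_rp]
      integer :: dg(m) = [1, 1, 2, 2]
      integer :: gd(m) = [2, 5, 3, 6]
      integer :: b(nb) = [2, 2]
      integer :: bo(nb) = [0, 2]
      integer :: i, j

      do i = 1, nb
         do j = 1, b(i)
            u(gd(bo(i) + j)) = v(dg(bo(i) + 1))   ! broadcast the block value
         end do
      end do

      print *, u   ! 0 7 9 0 7 9
    end program scatter_sketch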
434 
435 end module gs_device