Neko  0.9.0
A portable framework for high-order spectral element flow simulations
gs_device.F90
Go to the documentation of this file.
1 ! Copyright (c) 2021-2022, The Neko Authors
2 ! All rights reserved.
3 !
4 ! Redistribution and use in source and binary forms, with or without
5 ! modification, are permitted provided that the following conditions
6 ! are met:
7 !
8 ! * Redistributions of source code must retain the above copyright
9 ! notice, this list of conditions and the following disclaimer.
10 !
11 ! * Redistributions in binary form must reproduce the above
12 ! copyright notice, this list of conditions and the following
13 ! disclaimer in the documentation and/or other materials provided
14 ! with the distribution.
15 !
16 ! * Neither the name of the authors nor the names of its
17 ! contributors may be used to endorse or promote products derived
18 ! from this software without specific prior written permission.
19 !
20 ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 ! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 ! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 ! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 ! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 ! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 ! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 ! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 ! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 ! POSSIBILITY OF SUCH DAMAGE.
32 !
34 module gs_device
35  use neko_config
36  use num_types
37  use gs_bcknd
38  use device
39  use gs_ops
40  use utils
41  use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_null_ptr, &
42  c_associated
43  implicit none
44  private
45 
47  type, public, extends(gs_bcknd_t) :: gs_device_t
48  integer, allocatable :: local_blk_off(:)
49  integer, allocatable :: shared_blk_off(:)
50  type(c_ptr) :: local_gs_d = c_null_ptr
51  type(c_ptr) :: local_dof_gs_d = c_null_ptr
52  type(c_ptr) :: local_gs_dof_d = c_null_ptr
53  type(c_ptr) :: shared_gs_d = c_null_ptr
54  type(c_ptr) :: shared_dof_gs_d = c_null_ptr
55  type(c_ptr) :: shared_gs_dof_d = c_null_ptr
56  type(c_ptr) :: local_blk_len_d = c_null_ptr
57  type(c_ptr) :: shared_blk_len_d = c_null_ptr
58  type(c_ptr) :: local_blk_off_d = c_null_ptr
59  type(c_ptr) :: shared_blk_off_d = c_null_ptr
60  integer :: nlocal
61  integer :: nshared
62  logical :: shared_on_host
63  contains
64  procedure, pass(this) :: init => gs_device_init
65  procedure, pass(this) :: free => gs_device_free
66  procedure, pass(this) :: gather => gs_gather_device
67  procedure, pass(this) :: scatter => gs_scatter_device
68  end type gs_device_t
69 
70 #ifdef HAVE_HIP
71  interface
72  subroutine hip_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
73  bind(c, name='hip_gather_kernel')
74  use, intrinsic :: iso_c_binding
75  implicit none
76  integer(c_int) :: m, n, nb, o, op
77  type(c_ptr), value :: v, u, dg, gd, b, bo, strm
78  end subroutine hip_gather_kernel
79  end interface
80 
81  interface
82  subroutine hip_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
83  bind(c, name='hip_scatter_kernel')
84  use, intrinsic :: iso_c_binding
85  implicit none
86  integer(c_int) :: m, n, nb
87  type(c_ptr), value :: v, u, dg, gd, b, bo, strm
88  end subroutine hip_scatter_kernel
89  end interface
90 
91 #elif HAVE_CUDA
92  interface
93  subroutine cuda_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
94  bind(c, name='cuda_gather_kernel')
95  use, intrinsic :: iso_c_binding
96  implicit none
97  integer(c_int) :: m, n, nb, o, op
98  type(c_ptr), value :: v, u, dg, gd, b, bo, strm
99  end subroutine cuda_gather_kernel
100  end interface
101 
102  interface
103  subroutine cuda_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
104  bind(c, name='cuda_scatter_kernel')
105  use, intrinsic :: iso_c_binding
106  implicit none
107  integer(c_int) :: m, n, nb
108  type(c_ptr), value :: v, u, dg, gd, b, bo, strm
109  end subroutine cuda_scatter_kernel
110  end interface
111 #elif HAVE_OPENCL
112  interface
113  subroutine opencl_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op) &
114  bind(c, name='opencl_gather_kernel')
115  use, intrinsic :: iso_c_binding
116  implicit none
117  integer(c_int) :: m, n, nb, o, op
118  type(c_ptr), value :: v, u, dg, gd, b, bo
119  end subroutine opencl_gather_kernel
120  end interface
121 
122  interface
123  subroutine opencl_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo) &
124  bind(c, name='opencl_scatter_kernel')
125  use, intrinsic :: iso_c_binding
126  implicit none
127  integer(c_int) :: m, n, nb
128  type(c_ptr), value :: v, u, dg, gd, b, bo
129  end subroutine opencl_scatter_kernel
130  end interface
131 #endif
132 
133 contains
134 
136  subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
137  class(gs_device_t), intent(inout) :: this
138  integer, intent(in) :: nlocal
139  integer, intent(in) :: nshared
140  integer, intent(in) :: nlcl_blks
141  integer, intent(in) :: nshrd_blks
142 
143  call this%free()
144 
145  this%nlocal = nlocal
146  this%nshared = nshared
147 
148  allocate(this%local_blk_off(nlcl_blks))
149  allocate(this%shared_blk_off(nshrd_blks))
150 
151  this%local_gs_d = c_null_ptr
152  this%local_dof_gs_d = c_null_ptr
153  this%local_gs_dof_d = c_null_ptr
154  this%local_blk_len_d = c_null_ptr
155  this%local_blk_off_d = c_null_ptr
156  this%shared_gs_d = c_null_ptr
157  this%shared_dof_gs_d = c_null_ptr
158  this%shared_gs_dof_d = c_null_ptr
159  this%shared_blk_len_d = c_null_ptr
160  this%shared_blk_off_d = c_null_ptr
161 
162  this%shared_on_host = .true.
163 
164  call device_event_create(this%gather_event, 2)
165  call device_event_create(this%scatter_event, 2)
166 
167  this%gs_stream = glb_cmd_queue
168 
169  end subroutine gs_device_init
170 
172  subroutine gs_device_free(this)
173  class(gs_device_t), intent(inout) :: this
174 
175  if (allocated(this%local_blk_off)) then
176  deallocate(this%local_blk_off)
177  end if
178 
179  if (allocated(this%shared_blk_off)) then
180  deallocate(this%shared_blk_off)
181  end if
182 
183  if (c_associated(this%local_gs_d)) then
184  call device_free(this%local_gs_d)
185  end if
186 
187  if (c_associated(this%local_dof_gs_d)) then
188  call device_free(this%local_dof_gs_d)
189  end if
190 
191  if (c_associated(this%local_gs_dof_d)) then
192  call device_free(this%local_gs_dof_d)
193  end if
194 
195  if (c_associated(this%local_blk_len_d)) then
196  call device_free(this%local_blk_len_d)
197  end if
198 
199  if (c_associated(this%shared_blk_len_d)) then
200  call device_free(this%shared_blk_len_d)
201  end if
202 
203  if (c_associated(this%local_blk_off_d)) then
204  call device_free(this%local_blk_off_d)
205  end if
206 
207  if (c_associated(this%shared_blk_off_d)) then
208  call device_free(this%shared_blk_off_d)
209  end if
210 
211  this%nlocal = 0
212  this%nshared = 0
213 
214  if (c_associated(this%gather_event)) then
215  call device_event_destroy(this%gather_event)
216  end if
217 
218  if (c_associated(this%scatter_event)) then
219  call device_event_destroy(this%scatter_event)
220  end if
221 
222  if (c_associated(this%gs_stream)) then
223  this%gs_stream = c_null_ptr
224  end if
225 
226  end subroutine gs_device_free
227 
229  subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
230  integer, intent(in) :: m
231  integer, intent(in) :: n
232  integer, intent(in) :: nb
233  class(gs_device_t), intent(inout) :: this
234  real(kind=rp), dimension(m), intent(inout) :: v
235  integer, dimension(m), intent(inout) :: dg
236  real(kind=rp), dimension(n), intent(inout) :: u
237  integer, dimension(m), intent(inout) :: gd
238  integer, dimension(nb), intent(inout) :: b
239  integer, intent(in) :: o
240  integer, intent(in) :: op
241  logical, intent(in) :: shrd
242  integer :: i
243  type(c_ptr) :: u_d
244 
245  u_d = device_get_ptr(u)
246 
247  if (.not. shrd) then
248  associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
249  gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
250  bo=>this%local_blk_off, bo_d=>this%local_blk_off_d, &
251  strm=>this%gs_stream)
252 
253  if (.not. c_associated(v_d)) then
254  call device_map(v, v_d, m)
255  end if
256 
257  if (.not. c_associated(dg_d)) then
258  call device_map(dg, dg_d, m)
259  call device_memcpy(dg, dg_d, m, host_to_device, &
260  sync=.false., strm=strm)
261  end if
262 
263  if (.not. c_associated(gd_d)) then
264  call device_map(gd, gd_d, m)
265  call device_memcpy(gd, gd_d, m, host_to_device, &
266  sync=.false., strm=strm)
267  end if
268 
269  if (.not. c_associated(b_d)) then
270  call device_map(b, b_d, nb)
271  call device_memcpy(b, b_d, nb, host_to_device, &
272  sync=.false., strm=strm)
273  end if
274 
275  if (.not. c_associated(bo_d)) then
276  call device_map(bo, bo_d, nb)
277  bo(1) = 0
278  do i = 2, nb
279  bo(i) = bo(i - 1) + b(i - 1)
280  end do
281  call device_memcpy(bo, bo_d, nb, host_to_device, &
282  sync=.false., strm=strm)
283  end if
284 
285 #ifdef HAVE_HIP
286  call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
287  nb, b_d, bo_d, op, strm)
288 #elif HAVE_CUDA
289  call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
290  nb, b_d, bo_d, op, strm)
291 #elif HAVE_OPENCL
292  call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
293  nb, b_d, bo_d, op)
294 #else
295  call neko_error('No device backend configured')
296 #endif
297 
298  end associate
299  else if (shrd) then
300  associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
301  gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
302  bo=>this%shared_blk_off, bo_d=>this%shared_blk_off_d, &
303  strm=>this%gs_stream)
304 
305  if (.not. c_associated(v_d)) then
306  call device_map(v, v_d, m)
307  end if
308 
309  if (.not. c_associated(dg_d)) then
310  call device_map(dg, dg_d, m)
311  call device_memcpy(dg, dg_d, m, host_to_device, &
312  sync=.false., strm=strm)
313  end if
314 
315  if (.not. c_associated(gd_d)) then
316  call device_map(gd, gd_d, m)
317  call device_memcpy(gd, gd_d, m, host_to_device, &
318  sync=.false., strm=strm)
319  end if
320 
321  if (.not. c_associated(b_d)) then
322  call device_map(b, b_d, nb)
323  call device_memcpy(b, b_d, nb, host_to_device, &
324  sync=.false., strm=strm)
325  end if
326 
327  if (.not. c_associated(bo_d)) then
328  call device_map(bo, bo_d, nb)
329  bo(1) = 0
330  do i = 2, nb
331  bo(i) = bo(i - 1) + b(i - 1)
332  end do
333  call device_memcpy(bo, bo_d, nb, host_to_device, &
334  sync=.false., strm=strm)
335  end if
336 
337 
338 #ifdef HAVE_HIP
339  call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
340  nb, b_d, bo_d, op, strm)
341 #elif HAVE_CUDA
342  call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
343  nb, b_d, bo_d, op, strm)
344 #elif HAVE_OPENCL
345  call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
346  nb, b_d, bo_d, op)
347 #else
348  call neko_error('No device backend configured')
349 #endif
350 
351  call device_event_record(this%gather_event, strm)
352 
353  if (this%shared_on_host) then
354  if (this%nshared .eq. m) then
355  call device_memcpy(v, v_d, m, device_to_host, &
356  sync=.true., strm=strm)
357  end if
358  end if
359 
360  end associate
361  end if
362 
363  end subroutine gs_gather_device
364 
366  subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
367  integer, intent(in) :: m
368  integer, intent(in) :: n
369  integer, intent(in) :: nb
370  class(gs_device_t), intent(inout) :: this
371  real(kind=rp), dimension(m), intent(inout) :: v
372  integer, dimension(m), intent(inout) :: dg
373  real(kind=rp), dimension(n), intent(inout) :: u
374  integer, dimension(m), intent(inout) :: gd
375  integer, dimension(nb), intent(inout) :: b
376  logical, intent(in) :: shrd
377  type(c_ptr) :: event
378  type(c_ptr) :: u_d
379 
380  u_d = device_get_ptr(u)
381 
382  if (.not. shrd) then
383  associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
384  gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
385  bo_d=>this%local_blk_off_d, strm=>this%gs_stream)
386 #ifdef HAVE_HIP
387  call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
388 #elif HAVE_CUDA
389  call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
390 #elif HAVE_OPENCL
391  call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d)
392 #else
393  call neko_error('No device backend configured')
394 #endif
395  end associate
396  else if (shrd) then
397  associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
398  gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
399  bo_d=>this%shared_blk_off_d, strm=>this%gs_stream)
400 
401  if (this%shared_on_host) then
402  call device_memcpy(v, v_d, m, host_to_device, &
403  sync=.false., strm=strm)
404  end if
405 
406 #ifdef HAVE_HIP
407  call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
408 #elif HAVE_CUDA
409  call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
410 #elif HAVE_OPENCL
411  call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d)
412 #else
413  call neko_error('No device backend configured')
414 #endif
415 
416  if (c_associated(event)) then
417  call device_event_record(event, strm)
418  else
419  call device_sync(strm)
420  end if
421 
422  end associate
423  end if
424 
425  end subroutine gs_scatter_device
426 
427 end module gs_device
void opencl_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op)
Definition: gs.c:58
void opencl_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo)
Definition: gs.c:166
void cuda_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, cudaStream_t stream)
Definition: gs.cu:51
void cuda_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, cudaStream_t stream)
Definition: gs.cu:96
Return the device pointer for an associated Fortran array.
Definition: device.F90:81
Map a Fortran array to a device (allocate and associate)
Definition: device.F90:57
Copy data between host and device (or device and device)
Definition: device.F90:51
Device abstraction, common interface for various accelerators.
Definition: device.F90:34
subroutine, public device_event_record(event, stream)
Record a device event.
Definition: device.F90:1210
integer, parameter, public host_to_device
Definition: device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition: device.F90:185
integer, parameter, public device_to_host
Definition: device.F90:47
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition: device.F90:1194
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition: device.F90:1164
Defines a gather-scatter backend.
Definition: gs_bcknd.f90:34
Generic Gather-scatter backend for accelerators.
Definition: gs_device.F90:34
subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
Gather kernel.
Definition: gs_device.F90:230
subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
Scatter kernel.
Definition: gs_device.F90:367
subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
Accelerator backend initialisation.
Definition: gs_device.F90:137
subroutine gs_device_free(this)
Dummy backend deallocation.
Definition: gs_device.F90:173
Defines Gather-scatter operations.
Definition: gs_ops.f90:34
Build configurations.
Definition: neko_config.f90:34
integer, parameter, public rp
Global precision used in computations.
Definition: num_types.f90:12
Utilities.
Definition: utils.f90:35
Gather-scatter backend.
Definition: gs_bcknd.f90:44
Gather-scatter backend for offloading devices.
Definition: gs_device.F90:47