42 use,
intrinsic :: iso_c_binding, only : c_ptr, c_int, c_null_ptr, &
43 c_associated, c_sizeof, c_size_t
49 type(c_ptr) :: local_gs_d = c_null_ptr
50 type(c_ptr) :: local_dof_gs_d = c_null_ptr
51 type(c_ptr) :: local_gs_dof_d = c_null_ptr
52 type(c_ptr) :: shared_gs_d = c_null_ptr
53 type(c_ptr) :: shared_dof_gs_d = c_null_ptr
54 type(c_ptr) :: shared_gs_dof_d = c_null_ptr
55 type(c_ptr) :: local_blk_len_d = c_null_ptr
56 type(c_ptr) :: shared_blk_len_d = c_null_ptr
57 type(c_ptr) :: local_blk_off_d = c_null_ptr
58 type(c_ptr) :: shared_blk_off_d = c_null_ptr
61 logical :: shared_on_host
71 subroutine hip_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
72 bind(c, name =
'hip_gather_kernel')
73 use,
intrinsic :: iso_c_binding
75 integer(c_int) :: m, n, nb, o, op
76 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
82 bind(c, name =
'hip_scatter_kernel')
83 use,
intrinsic :: iso_c_binding
85 integer(c_int) :: m, n, nb
86 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
92 subroutine cuda_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
93 bind(c, name =
'cuda_gather_kernel')
94 use,
intrinsic :: iso_c_binding
96 integer(c_int) :: m, n, nb, o, op
97 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
103 bind(c, name =
'cuda_scatter_kernel')
104 use,
intrinsic :: iso_c_binding
106 integer(c_int) :: m, n, nb
107 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
112 subroutine opencl_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, &
113 strm) bind(c, name = 'opencl_gather_kernel')
114 use,
intrinsic :: iso_c_binding
116 integer(c_int) :: m, n, nb, o, op
117 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
123 bind(c, name =
'opencl_scatter_kernel')
124 use,
intrinsic :: iso_c_binding
126 integer(c_int) :: m, n, nb
127 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
137 integer,
intent(in) :: nlocal
138 integer,
intent(in) :: nshared
139 integer,
intent(in) :: nlcl_blks
140 integer,
intent(in) :: nshrd_blks
145 this%nshared = nshared
147 this%local_gs_d = c_null_ptr
148 this%local_dof_gs_d = c_null_ptr
149 this%local_gs_dof_d = c_null_ptr
150 this%local_blk_len_d = c_null_ptr
151 this%local_blk_off_d = c_null_ptr
152 this%shared_gs_d = c_null_ptr
153 this%shared_dof_gs_d = c_null_ptr
154 this%shared_gs_dof_d = c_null_ptr
155 this%shared_blk_len_d = c_null_ptr
156 this%shared_blk_off_d = c_null_ptr
158 this%shared_on_host = .true.
160#if defined(HAVE_HIP) || defined(HAVE_CUDA)
173 if (c_associated(this%local_gs_d))
then
177 if (c_associated(this%local_dof_gs_d))
then
181 if (c_associated(this%local_gs_dof_d))
then
185 if (c_associated(this%local_blk_len_d))
then
189 if (c_associated(this%shared_blk_len_d))
then
193 if (c_associated(this%local_blk_off_d))
then
197 if (c_associated(this%shared_blk_off_d))
then
204#if defined(HAVE_HIP) || defined(HAVE_CUDA)
205 if (c_associated(this%gather_event))
then
209 if (c_associated(this%scatter_event))
then
214 if (c_associated(this%gs_stream))
then
215 this%gs_stream = c_null_ptr
221 subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, bo, op, shrd)
222 integer,
intent(in) :: m
223 integer,
intent(in) :: n
224 integer,
intent(in) :: nb
226 real(kind=
rp),
dimension(m),
intent(inout) :: v
227 integer,
dimension(m),
intent(inout) :: dg
228 real(kind=
rp),
dimension(n),
intent(inout) :: u
229 integer,
dimension(m),
intent(inout) :: gd
230 integer,
dimension(nb),
intent(inout) :: b
231 integer,
dimension(nb),
intent(inout) :: bo
232 integer,
intent(in) :: o
233 integer,
intent(in) :: op
234 logical,
intent(in) :: shrd
241 associate(v_d => this%local_gs_d, dg_d => this%local_dof_gs_d, &
242 gd_d => this%local_gs_dof_d, b_d => this%local_blk_len_d, &
243 bo_d => this%local_blk_off_d, strm => this%gs_stream)
245 if (.not. c_associated(v_d))
then
248 real(c_rp) :: rp_dummy
249 integer(c_size_t) :: s
250 s = c_sizeof(rp_dummy) * m
255 if (.not. c_associated(dg_d))
then
258 sync = .false., strm = strm)
261 if (.not. c_associated(gd_d))
then
264 sync = .false., strm = strm)
268 if (.not. c_associated(b_d))
then
271 sync = .false., strm = strm)
274 if (.not. c_associated(bo_d))
then
277 sync = .false., strm = strm)
283 nb, b_d, bo_d, op, strm)
286 nb, b_d, bo_d, op, strm)
289 nb, b_d, bo_d, op, strm)
291 call neko_error(
'No device backend configured')
296 associate(v_d => this%shared_gs_d, dg_d => this%shared_dof_gs_d, &
297 gd_d => this%shared_gs_dof_d, b_d => this%shared_blk_len_d, &
298 bo_d => this%shared_blk_off_d, strm => this%gs_stream)
300 if (.not. c_associated(v_d))
then
303 real(c_rp) :: rp_dummy
304 integer(c_size_t) :: s
305 s = c_sizeof(rp_dummy) * m
310 if (.not. c_associated(dg_d))
then
311 call device_map(dg, dg_d, m)
312 call device_memcpy(dg, dg_d, m, host_to_device, &
313 sync = .false., strm = strm)
316 if (.not. c_associated(gd_d))
then
317 call device_map(gd, gd_d, m)
318 call device_memcpy(gd, gd_d, m, host_to_device, &
319 sync = .false., strm = strm)
323 if (.not. c_associated(b_d))
then
324 call device_map(b, b_d, nb)
325 call device_memcpy(b, b_d, nb, host_to_device, &
326 sync = .false., strm = strm)
329 if (.not. c_associated(bo_d))
then
330 call device_map(bo, bo_d, nb)
331 call device_memcpy(bo, bo_d, nb, host_to_device, &
332 sync = .false., strm = strm)
339 nb, b_d, bo_d, op, strm)
342 nb, b_d, bo_d, op, strm)
345 nb, b_d, bo_d, op, strm)
347 call neko_error(
'No device backend configured')
350#if defined(HAVE_HIP) || defined(HAVE_CUDA) || defined(HAVE_OPENCL)
351 call device_event_record(this%gather_event, strm)
354 if (this%shared_on_host)
then
355 if (this%nshared .eq. m)
then
356 call device_memcpy(v, v_d, m, device_to_host, &
357 sync = .true., strm = strm)
367 subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, bo, shrd, event)
368 integer,
intent(in) :: m
369 integer,
intent(in) :: n
370 integer,
intent(in) :: nb
372 real(kind=rp),
dimension(m),
intent(inout) :: v
373 integer,
dimension(m),
intent(inout) :: dg
374 real(kind=rp),
dimension(n),
intent(inout) :: u
375 integer,
dimension(m),
intent(inout) :: gd
376 integer,
dimension(nb),
intent(inout) :: b
377 integer,
dimension(nb),
intent(inout) :: bo
378 logical,
intent(in) :: shrd
382 u_d = device_get_ptr(u)
385 associate(v_d => this%local_gs_d, dg_d => this%local_dof_gs_d, &
386 gd_d => this%local_gs_dof_d, b_d => this%local_blk_len_d, &
387 bo_d => this%local_blk_off_d, strm => this%gs_stream)
398 call neko_error(
'No device backend configured')
402 associate(v_d => this%shared_gs_d, dg_d => this%shared_dof_gs_d, &
403 gd_d => this%shared_gs_dof_d, b_d => this%shared_blk_len_d, &
404 bo_d => this%shared_blk_off_d, strm => this%gs_stream)
406 if (this%shared_on_host)
then
407 call device_memcpy(v, v_d, m, host_to_device, &
408 sync = .false., strm = strm)
421 call neko_error(
'No device backend configured')
426#if defined(HAVE_HIP) || defined(HAVE_CUDA) || defined(HAVE_OPENCL)
427 if (c_associated(event))
then
428 call device_event_record(event, this%gs_stream)
430 call device_sync(this%gs_stream)
void opencl_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, void *cmd_queue)
void opencl_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, void *cmd_queue)
void cuda_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, cudaStream_t stream)
void cuda_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, cudaStream_t stream)
Return the device pointer for an associated Fortran array.
Map a Fortran array to a device (allocate and associate)
Copy data between host and device (or device and device)
Synchronize a device or stream.
Device abstraction, common interface for various accelerators.
subroutine, public device_event_record(event, stream)
Record a device event.
integer, parameter, public host_to_device
subroutine, public device_free(x_d)
Deallocate memory on the device.
integer, parameter, public device_to_host
subroutine, public device_event_destroy(event)
Destroy a device event.
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
subroutine, public device_event_create(event, flags)
Create a device event queue.
subroutine, public device_memset(x_d, v, s, sync, strm)
Set memory on the device to a value.
Defines a gather-scatter backend.
Generic Gather-scatter backend for accelerators.
subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
Accelerator backend initialisation.
subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, bo, op, shrd)
Gather kernel.
subroutine gs_device_free(this)
Dummy backend deallocation.
subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, bo, shrd, event)
Scatter kernel.
integer, parameter, public c_rp
integer, parameter, public rp
Global precision used in computations.
Gather-scatter backend for offloading devices.