42 use,
intrinsic :: iso_c_binding, only : c_ptr, c_int, c_null_ptr, &
43 c_associated, c_sizeof, c_size_t
49 integer,
allocatable :: local_blk_off(:)
50 integer,
allocatable :: shared_blk_off(:)
51 type(c_ptr) :: local_gs_d = c_null_ptr
52 type(c_ptr) :: local_dof_gs_d = c_null_ptr
53 type(c_ptr) :: local_gs_dof_d = c_null_ptr
54 type(c_ptr) :: shared_gs_d = c_null_ptr
55 type(c_ptr) :: shared_dof_gs_d = c_null_ptr
56 type(c_ptr) :: shared_gs_dof_d = c_null_ptr
57 type(c_ptr) :: local_blk_len_d = c_null_ptr
58 type(c_ptr) :: shared_blk_len_d = c_null_ptr
59 type(c_ptr) :: local_blk_off_d = c_null_ptr
60 type(c_ptr) :: shared_blk_off_d = c_null_ptr
63 logical :: shared_on_host
73 subroutine hip_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
74 bind(c, name=
'hip_gather_kernel')
75 use,
intrinsic :: iso_c_binding
77 integer(c_int) :: m, n, nb, o, op
78 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
84 bind(c, name=
'hip_scatter_kernel')
85 use,
intrinsic :: iso_c_binding
87 integer(c_int) :: m, n, nb
88 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
94 subroutine cuda_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
95 bind(c, name=
'cuda_gather_kernel')
96 use,
intrinsic :: iso_c_binding
98 integer(c_int) :: m, n, nb, o, op
99 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
105 bind(c, name=
'cuda_scatter_kernel')
106 use,
intrinsic :: iso_c_binding
108 integer(c_int) :: m, n, nb
109 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
114 subroutine opencl_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
115 bind(c, name=
'opencl_gather_kernel')
116 use,
intrinsic :: iso_c_binding
118 integer(c_int) :: m, n, nb, o, op
119 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
125 bind(c, name=
'opencl_scatter_kernel')
126 use,
intrinsic :: iso_c_binding
128 integer(c_int) :: m, n, nb
129 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
139 integer,
intent(in) :: nlocal
140 integer,
intent(in) :: nshared
141 integer,
intent(in) :: nlcl_blks
142 integer,
intent(in) :: nshrd_blks
147 this%nshared = nshared
149 allocate(this%local_blk_off(nlcl_blks))
150 allocate(this%shared_blk_off(nshrd_blks))
152 this%local_gs_d = c_null_ptr
153 this%local_dof_gs_d = c_null_ptr
154 this%local_gs_dof_d = c_null_ptr
155 this%local_blk_len_d = c_null_ptr
156 this%local_blk_off_d = c_null_ptr
157 this%shared_gs_d = c_null_ptr
158 this%shared_dof_gs_d = c_null_ptr
159 this%shared_gs_dof_d = c_null_ptr
160 this%shared_blk_len_d = c_null_ptr
161 this%shared_blk_off_d = c_null_ptr
163 this%shared_on_host = .true.
165#if defined(HAVE_HIP) || defined(HAVE_CUDA)
178 if (
allocated(this%local_blk_off))
then
179 deallocate(this%local_blk_off)
182 if (
allocated(this%shared_blk_off))
then
183 deallocate(this%shared_blk_off)
186 if (c_associated(this%local_gs_d))
then
190 if (c_associated(this%local_dof_gs_d))
then
194 if (c_associated(this%local_gs_dof_d))
then
198 if (c_associated(this%local_blk_len_d))
then
202 if (c_associated(this%shared_blk_len_d))
then
206 if (c_associated(this%local_blk_off_d))
then
210 if (c_associated(this%shared_blk_off_d))
then
217#if defined(HAVE_HIP) || defined(HAVE_CUDA)
218 if (c_associated(this%gather_event))
then
222 if (c_associated(this%scatter_event))
then
227 if (c_associated(this%gs_stream))
then
228 this%gs_stream = c_null_ptr
234 subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
235 integer,
intent(in) :: m
236 integer,
intent(in) :: n
237 integer,
intent(in) :: nb
239 real(kind=
rp),
dimension(m),
intent(inout) :: v
240 integer,
dimension(m),
intent(inout) :: dg
241 real(kind=
rp),
dimension(n),
intent(inout) :: u
242 integer,
dimension(m),
intent(inout) :: gd
243 integer,
dimension(nb),
intent(inout) :: b
244 integer,
intent(in) :: o
245 integer,
intent(in) :: op
246 logical,
intent(in) :: shrd
253 associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
254 gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
255 bo=>this%local_blk_off, bo_d=>this%local_blk_off_d, &
256 strm=>this%gs_stream)
258 if (.not. c_associated(v_d))
then
261 real(c_rp) :: rp_dummy
262 integer(c_size_t) :: s
263 s = c_sizeof(rp_dummy) * m
268 if (.not. c_associated(dg_d))
then
271 sync=.false., strm=strm)
274 if (.not. c_associated(gd_d))
then
277 sync=.false., strm=strm)
281 if (.not. c_associated(b_d))
then
284 sync=.false., strm=strm)
287 if (.not. c_associated(bo_d))
then
291 bo(i) = bo(i - 1) + b(i - 1)
294 sync=.false., strm=strm)
300 nb, b_d, bo_d, op, strm)
303 nb, b_d, bo_d, op, strm)
306 nb, b_d, bo_d, op, strm)
308 call neko_error(
'No device backend configured')
313 associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
314 gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
315 bo=>this%shared_blk_off, bo_d=>this%shared_blk_off_d, &
316 strm=>this%gs_stream)
318 if (.not. c_associated(v_d))
then
319 call device_map(v, v_d, m)
321 real(c_rp) :: rp_dummy
322 integer(c_size_t) :: s
323 s = c_sizeof(rp_dummy) * m
324 call device_memset(v_d, 0, s, strm = strm)
328 if (.not. c_associated(dg_d))
then
329 call device_map(dg, dg_d, m)
330 call device_memcpy(dg, dg_d, m, host_to_device, &
331 sync=.false., strm=strm)
334 if (.not. c_associated(gd_d))
then
335 call device_map(gd, gd_d, m)
336 call device_memcpy(gd, gd_d, m, host_to_device, &
337 sync=.false., strm=strm)
341 if (.not. c_associated(b_d))
then
342 call device_map(b, b_d, nb)
343 call device_memcpy(b, b_d, nb, host_to_device, &
344 sync=.false., strm=strm)
347 if (.not. c_associated(bo_d))
then
348 call device_map(bo, bo_d, nb)
351 bo(i) = bo(i - 1) + b(i - 1)
353 call device_memcpy(bo, bo_d, nb, host_to_device, &
354 sync=.false., strm=strm)
361 nb, b_d, bo_d, op, strm)
364 nb, b_d, bo_d, op, strm)
367 nb, b_d, bo_d, op, strm)
369 call neko_error(
'No device backend configured')
372#if defined(HAVE_HIP) || defined(HAVE_CUDA) || defined(HAVE_OPENCL)
373 call device_event_record(this%gather_event, strm)
376 if (this%shared_on_host)
then
377 if (this%nshared .eq. m)
then
378 call device_memcpy(v, v_d, m, device_to_host, &
379 sync=.true., strm=strm)
389 subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
390 integer,
intent(in) :: m
391 integer,
intent(in) :: n
392 integer,
intent(in) :: nb
394 real(kind=rp),
dimension(m),
intent(inout) :: v
395 integer,
dimension(m),
intent(inout) :: dg
396 real(kind=rp),
dimension(n),
intent(inout) :: u
397 integer,
dimension(m),
intent(inout) :: gd
398 integer,
dimension(nb),
intent(inout) :: b
399 logical,
intent(in) :: shrd
403 u_d = device_get_ptr(u)
406 associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
407 gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
408 bo_d=>this%local_blk_off_d, strm=>this%gs_stream)
416 call neko_error(
'No device backend configured')
420 associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
421 gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
422 bo_d=>this%shared_blk_off_d, strm=>this%gs_stream)
424 if (this%shared_on_host)
then
425 call device_memcpy(v, v_d, m, host_to_device, &
426 sync=.false., strm=strm)
436 call neko_error(
'No device backend configured')
441#if defined(HAVE_HIP) || defined(HAVE_CUDA) || defined(HAVE_OPENCL)
442 if (c_associated(event))
then
443 call device_event_record(event, this%gs_stream)
445 call device_sync(this%gs_stream)
void opencl_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, void *cmd_queue)
void opencl_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, void *cmd_queue)
void cuda_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, cudaStream_t stream)
void cuda_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, cudaStream_t stream)
Return the device pointer for an associated Fortran array.
Map a Fortran array to a device (allocate and associate)
Copy data between host and device (or device and device)
Synchronize a device or stream.
Device abstraction, common interface for various accelerators.
subroutine, public device_event_record(event, stream)
Record a device event.
integer, parameter, public host_to_device
subroutine, public device_free(x_d)
Deallocate memory on the device.
integer, parameter, public device_to_host
subroutine, public device_event_destroy(event)
Destroy a device event.
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
subroutine, public device_event_create(event, flags)
Create a device event queue.
subroutine, public device_memset(x_d, v, s, sync, strm)
Set memory on the device to a value.
Defines a gather-scatter backend.
Generic Gather-scatter backend for accelerators.
subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
Gather kernel.
subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
Scatter kernel.
subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
Accelerator backend initialisation.
subroutine gs_device_free(this)
Dummy backend deallocation.
integer, parameter, public c_rp
integer, parameter, public rp
Global precision used in computations.
Gather-scatter backend for offloading devices.