42 use,
intrinsic :: iso_c_binding, only : c_ptr, c_int, c_null_ptr, &
43 c_associated, c_sizeof, c_size_t
49 type(c_ptr) :: local_gs_d = c_null_ptr
50 type(c_ptr) :: local_dof_gs_d = c_null_ptr
51 type(c_ptr) :: local_gs_dof_d = c_null_ptr
52 type(c_ptr) :: shared_gs_d = c_null_ptr
53 type(c_ptr) :: shared_dof_gs_d = c_null_ptr
54 type(c_ptr) :: shared_gs_dof_d = c_null_ptr
55 type(c_ptr) :: local_blk_len_d = c_null_ptr
56 type(c_ptr) :: shared_blk_len_d = c_null_ptr
57 type(c_ptr) :: local_blk_off_d = c_null_ptr
58 type(c_ptr) :: shared_blk_off_d = c_null_ptr
61 logical :: shared_on_host
71 subroutine hip_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
72 bind(c, name =
'hip_gather_kernel')
73 use,
intrinsic :: iso_c_binding
75 integer(c_int) :: m, n, nb, o, op
76 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
82 bind(c, name =
'hip_scatter_kernel')
83 use,
intrinsic :: iso_c_binding
85 integer(c_int) :: m, n, nb
86 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
92 subroutine cuda_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
93 bind(c, name =
'cuda_gather_kernel')
94 use,
intrinsic :: iso_c_binding
96 integer(c_int) :: m, n, nb, o, op
97 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
103 bind(c, name =
'cuda_scatter_kernel')
104 use,
intrinsic :: iso_c_binding
106 integer(c_int) :: m, n, nb
107 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
112 subroutine opencl_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, &
113 strm) bind(c, name = 'opencl_gather_kernel')
114 use,
intrinsic :: iso_c_binding
116 integer(c_int) :: m, n, nb, o, op
117 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
123 bind(c, name =
'opencl_scatter_kernel')
124 use,
intrinsic :: iso_c_binding
126 integer(c_int) :: m, n, nb
127 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
132 subroutine metal_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, &
133 strm) bind(c, name = 'metal_gather_kernel')
134 use,
intrinsic :: iso_c_binding
136 integer(c_int) :: m, n, nb, o, op
137 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
138 end subroutine metal_gather_kernel
142 subroutine metal_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
143 bind(c, name =
'metal_scatter_kernel')
144 use,
intrinsic :: iso_c_binding
146 integer(c_int) :: m, n, nb
147 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
148 end subroutine metal_scatter_kernel
157 integer,
intent(in) :: nlocal
158 integer,
intent(in) :: nshared
159 integer,
intent(in) :: nlcl_blks
160 integer,
intent(in) :: nshrd_blks
165 this%nshared = nshared
167 this%local_gs_d = c_null_ptr
168 this%local_dof_gs_d = c_null_ptr
169 this%local_gs_dof_d = c_null_ptr
170 this%local_blk_len_d = c_null_ptr
171 this%local_blk_off_d = c_null_ptr
172 this%shared_gs_d = c_null_ptr
173 this%shared_dof_gs_d = c_null_ptr
174 this%shared_gs_dof_d = c_null_ptr
175 this%shared_blk_len_d = c_null_ptr
176 this%shared_blk_off_d = c_null_ptr
178 this%shared_on_host = .true.
180#if defined(HAVE_HIP) || defined(HAVE_CUDA) || defined(HAVE_METAL)
193 if (c_associated(this%local_gs_d))
then
197 if (c_associated(this%local_dof_gs_d))
then
201 if (c_associated(this%local_gs_dof_d))
then
205 if (c_associated(this%local_blk_len_d))
then
209 if (c_associated(this%shared_blk_len_d))
then
213 if (c_associated(this%local_blk_off_d))
then
217 if (c_associated(this%shared_blk_off_d))
then
224#if defined(HAVE_HIP) || defined(HAVE_CUDA) || defined(HAVE_METAL)
225 if (c_associated(this%gather_event))
then
229 if (c_associated(this%scatter_event))
then
234 if (c_associated(this%gs_stream))
then
235 this%gs_stream = c_null_ptr
241 subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, bo, op, shrd)
242 integer,
intent(in) :: m
243 integer,
intent(in) :: n
244 integer,
intent(in) :: nb
246 real(kind=
rp),
dimension(m),
intent(inout) :: v
247 integer,
dimension(m),
intent(inout) :: dg
248 real(kind=
rp),
dimension(n),
intent(inout) :: u
249 integer,
dimension(m),
intent(inout) :: gd
250 integer,
dimension(nb),
intent(inout) :: b
251 integer,
dimension(nb),
intent(inout) :: bo
252 integer,
intent(in) :: o
253 integer,
intent(in) :: op
254 logical,
intent(in) :: shrd
261 associate(v_d => this%local_gs_d, dg_d => this%local_dof_gs_d, &
262 gd_d => this%local_gs_dof_d, b_d => this%local_blk_len_d, &
263 bo_d => this%local_blk_off_d, strm => this%gs_stream)
265 if (.not. c_associated(v_d))
then
268 real(c_rp) :: rp_dummy
269 integer(c_size_t) :: s
270 s = c_sizeof(rp_dummy) * m
275 if (.not. c_associated(dg_d))
then
278 sync = .false., strm = strm)
281 if (.not. c_associated(gd_d))
then
284 sync = .false., strm = strm)
288 if (.not. c_associated(b_d))
then
291 sync = .false., strm = strm)
294 if (.not. c_associated(bo_d))
then
297 sync = .false., strm = strm)
303 nb, b_d, bo_d, op, strm)
306 nb, b_d, bo_d, op, strm)
309 nb, b_d, bo_d, op, strm)
311 call metal_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
312 nb, b_d, bo_d, op, strm)
314 call neko_error(
'No device backend configured')
319 associate(v_d => this%shared_gs_d, dg_d => this%shared_dof_gs_d, &
320 gd_d => this%shared_gs_dof_d, b_d => this%shared_blk_len_d, &
321 bo_d => this%shared_blk_off_d, strm => this%gs_stream)
323 if (.not. c_associated(v_d))
then
326 real(c_rp) :: rp_dummy
327 integer(c_size_t) :: s
328 s = c_sizeof(rp_dummy) * m
333 if (.not. c_associated(dg_d))
then
336 sync = .false., strm = strm)
339 if (.not. c_associated(gd_d))
then
342 sync = .false., strm = strm)
346 if (.not. c_associated(b_d))
then
349 sync = .false., strm = strm)
352 if (.not. c_associated(bo_d))
then
355 sync = .false., strm = strm)
362 nb, b_d, bo_d, op, strm)
365 nb, b_d, bo_d, op, strm)
368 nb, b_d, bo_d, op, strm)
370 call metal_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
371 nb, b_d, bo_d, op, strm)
373 call neko_error(
'No device backend configured')
376#if defined(HAVE_HIP) || defined(HAVE_CUDA) || \
377 defined(have_opencl) || defined(have_metal)
381 if (this%shared_on_host)
then
382 if (this%nshared .eq. m)
then
384 sync = .true., strm = strm)
394 subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, bo, shrd, event)
395 integer,
intent(in) :: m
396 integer,
intent(in) :: n
397 integer,
intent(in) :: nb
399 real(kind=rp),
dimension(m),
intent(inout) :: v
400 integer,
dimension(m),
intent(inout) :: dg
401 real(kind=rp),
dimension(n),
intent(inout) :: u
402 integer,
dimension(m),
intent(inout) :: gd
403 integer,
dimension(nb),
intent(inout) :: b
404 integer,
dimension(nb),
intent(inout) :: bo
405 logical,
intent(in) :: shrd
409 u_d = device_get_ptr(u)
412 associate(v_d => this%local_gs_d, dg_d => this%local_dof_gs_d, &
413 gd_d => this%local_gs_dof_d, b_d => this%local_blk_len_d, &
414 bo_d => this%local_blk_off_d, strm => this%gs_stream)
425 call metal_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, &
428 call neko_error(
'No device backend configured')
432 associate(v_d => this%shared_gs_d, dg_d => this%shared_dof_gs_d, &
433 gd_d => this%shared_gs_dof_d, b_d => this%shared_blk_len_d, &
434 bo_d => this%shared_blk_off_d, strm => this%gs_stream)
436 if (this%shared_on_host)
then
437 call device_memcpy(v, v_d, m, host_to_device, &
438 sync = .false., strm = strm)
451 call metal_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, &
454 call neko_error(
'No device backend configured')
459#if defined(HAVE_HIP) || defined(HAVE_CUDA) || \
460 defined(have_opencl) || defined(have_metal)
461 if (c_associated(event))
then
462 call device_event_record(event, this%gs_stream)
464 call device_sync(this%gs_stream)
void opencl_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, void *cmd_queue)
void opencl_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, void *cmd_queue)
void cuda_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, cudaStream_t stream)
void cuda_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, cudaStream_t stream)
Return the device pointer for an associated Fortran array.
Map a Fortran array to a device (allocate and associate)
Copy data between host and device (or device and device)
Synchronize a device or stream.
Device abstraction, common interface for various accelerators.
subroutine, public device_event_record(event, stream)
Record a device event.
integer, parameter, public host_to_device
subroutine, public device_free(x_d)
Deallocate memory on the device.
integer, parameter, public device_to_host
subroutine, public device_event_destroy(event)
Destroy a device event.
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
subroutine, public device_event_create(event, flags)
Create a device event queue.
subroutine, public device_memset(x_d, v, s, sync, strm)
Set memory on the device to a value.
Defines a gather-scatter backend.
Generic Gather-scatter backend for accelerators.
subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
Accelerator backend initialisation.
subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, bo, op, shrd)
Gather kernel.
subroutine gs_device_free(this)
Dummy backend deallocation.
subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, bo, shrd, event)
Scatter kernel.
integer, parameter, public c_rp
integer, parameter, public rp
Global precision used in computations.
Gather-scatter backend for offloading devices.