41 use,
intrinsic :: iso_c_binding, only : c_ptr, c_int, c_null_ptr, &
48 integer,
allocatable :: local_blk_off(:)
49 integer,
allocatable :: shared_blk_off(:)
50 type(c_ptr) :: local_gs_d = c_null_ptr
51 type(c_ptr) :: local_dof_gs_d = c_null_ptr
52 type(c_ptr) :: local_gs_dof_d = c_null_ptr
53 type(c_ptr) :: shared_gs_d = c_null_ptr
54 type(c_ptr) :: shared_dof_gs_d = c_null_ptr
55 type(c_ptr) :: shared_gs_dof_d = c_null_ptr
56 type(c_ptr) :: local_blk_len_d = c_null_ptr
57 type(c_ptr) :: shared_blk_len_d = c_null_ptr
58 type(c_ptr) :: local_blk_off_d = c_null_ptr
59 type(c_ptr) :: shared_blk_off_d = c_null_ptr
62 logical :: shared_on_host
72 subroutine hip_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
73 bind(c, name=
'hip_gather_kernel')
74 use,
intrinsic :: iso_c_binding
76 integer(c_int) :: m, n, nb, o, op
77 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
83 bind(c, name=
'hip_scatter_kernel')
84 use,
intrinsic :: iso_c_binding
86 integer(c_int) :: m, n, nb
87 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
93 subroutine cuda_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
94 bind(c, name=
'cuda_gather_kernel')
95 use,
intrinsic :: iso_c_binding
97 integer(c_int) :: m, n, nb, o, op
98 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
104 bind(c, name=
'cuda_scatter_kernel')
105 use,
intrinsic :: iso_c_binding
107 integer(c_int) :: m, n, nb
108 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
113 subroutine opencl_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op) &
114 bind(c, name=
'opencl_gather_kernel')
115 use,
intrinsic :: iso_c_binding
117 integer(c_int) :: m, n, nb, o, op
118 type(c_ptr),
value :: v, u, dg, gd, b, bo
124 bind(c, name=
'opencl_scatter_kernel')
125 use,
intrinsic :: iso_c_binding
127 integer(c_int) :: m, n, nb
128 type(c_ptr),
value :: v, u, dg, gd, b, bo
138 integer,
intent(in) :: nlocal
139 integer,
intent(in) :: nshared
140 integer,
intent(in) :: nlcl_blks
141 integer,
intent(in) :: nshrd_blks
146 this%nshared = nshared
148 allocate(this%local_blk_off(nlcl_blks))
149 allocate(this%shared_blk_off(nshrd_blks))
151 this%local_gs_d = c_null_ptr
152 this%local_dof_gs_d = c_null_ptr
153 this%local_gs_dof_d = c_null_ptr
154 this%local_blk_len_d = c_null_ptr
155 this%local_blk_off_d = c_null_ptr
156 this%shared_gs_d = c_null_ptr
157 this%shared_dof_gs_d = c_null_ptr
158 this%shared_gs_dof_d = c_null_ptr
159 this%shared_blk_len_d = c_null_ptr
160 this%shared_blk_off_d = c_null_ptr
162 this%shared_on_host = .true.
164#if defined(HAVE_HIP) || defined(HAVE_CUDA)
169 this%gs_stream = glb_cmd_queue
177 if (
allocated(this%local_blk_off))
then
178 deallocate(this%local_blk_off)
181 if (
allocated(this%shared_blk_off))
then
182 deallocate(this%shared_blk_off)
185 if (c_associated(this%local_gs_d))
then
189 if (c_associated(this%local_dof_gs_d))
then
193 if (c_associated(this%local_gs_dof_d))
then
197 if (c_associated(this%local_blk_len_d))
then
201 if (c_associated(this%shared_blk_len_d))
then
205 if (c_associated(this%local_blk_off_d))
then
209 if (c_associated(this%shared_blk_off_d))
then
216#if defined(HAVE_HIP) || defined(HAVE_CUDA)
217 if (c_associated(this%gather_event))
then
221 if (c_associated(this%scatter_event))
then
226 if (c_associated(this%gs_stream))
then
227 this%gs_stream = c_null_ptr
233 subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
234 integer,
intent(in) :: m
235 integer,
intent(in) :: n
236 integer,
intent(in) :: nb
238 real(kind=
rp),
dimension(m),
intent(inout) :: v
239 integer,
dimension(m),
intent(inout) :: dg
240 real(kind=
rp),
dimension(n),
intent(inout) :: u
241 integer,
dimension(m),
intent(inout) :: gd
242 integer,
dimension(nb),
intent(inout) :: b
243 integer,
intent(in) :: o
244 integer,
intent(in) :: op
245 logical,
intent(in) :: shrd
252 associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
253 gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
254 bo=>this%local_blk_off, bo_d=>this%local_blk_off_d, &
255 strm=>this%gs_stream)
257 if (.not. c_associated(v_d))
then
261 if (.not. c_associated(dg_d))
then
264 sync=.false., strm=strm)
267 if (.not. c_associated(gd_d))
then
270 sync=.false., strm=strm)
273 if (.not. c_associated(b_d))
then
276 sync=.false., strm=strm)
279 if (.not. c_associated(bo_d))
then
283 bo(i) = bo(i - 1) + b(i - 1)
286 sync=.false., strm=strm)
291 nb, b_d, bo_d, op, strm)
294 nb, b_d, bo_d, op, strm)
299 call neko_error(
'No device backend configured')
304 associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
305 gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
306 bo=>this%shared_blk_off, bo_d=>this%shared_blk_off_d, &
307 strm=>this%gs_stream)
309 if (.not. c_associated(v_d))
then
313 if (.not. c_associated(dg_d))
then
316 sync=.false., strm=strm)
319 if (.not. c_associated(gd_d))
then
322 sync=.false., strm=strm)
325 if (.not. c_associated(b_d))
then
328 sync=.false., strm=strm)
331 if (.not. c_associated(bo_d))
then
335 bo(i) = bo(i - 1) + b(i - 1)
338 sync=.false., strm=strm)
344 nb, b_d, bo_d, op, strm)
347 nb, b_d, bo_d, op, strm)
352 call neko_error(
'No device backend configured')
355#if defined(HAVE_HIP) || defined(HAVE_CUDA)
359 if (this%shared_on_host)
then
360 if (this%nshared .eq. m)
then
362 sync=.true., strm=strm)
372 subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
373 integer,
intent(in) :: m
374 integer,
intent(in) :: n
375 integer,
intent(in) :: nb
377 real(kind=rp),
dimension(m),
intent(inout) :: v
378 integer,
dimension(m),
intent(inout) :: dg
379 real(kind=rp),
dimension(n),
intent(inout) :: u
380 integer,
dimension(m),
intent(inout) :: gd
381 integer,
dimension(nb),
intent(inout) :: b
382 logical,
intent(in) :: shrd
386 u_d = device_get_ptr(u)
389 associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
390 gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
391 bo_d=>this%local_blk_off_d, strm=>this%gs_stream)
399 call neko_error(
'No device backend configured')
403 associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
404 gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
405 bo_d=>this%shared_blk_off_d, strm=>this%gs_stream)
407 if (this%shared_on_host)
then
408 call device_memcpy(v, v_d, m, host_to_device, &
409 sync=.false., strm=strm)
419 call neko_error(
'No device backend configured')
422#if defined(HAVE_HIP) || defined(HAVE_CUDA)
423 if (c_associated(event))
then
424 call device_event_record(event, strm)
426 call device_sync(strm)
void opencl_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op)
void opencl_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo)
void cuda_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, cudaStream_t stream)
void cuda_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, cudaStream_t stream)
Return the device pointer for an associated Fortran array.
Map a Fortran array to a device (allocate and associate)
Copy data between host and device (or device and device)
Device abstraction, common interface for various accelerators.
subroutine, public device_event_record(event, stream)
Record a device event.
integer, parameter, public host_to_device
subroutine, public device_free(x_d)
Deallocate memory on the device.
integer, parameter, public device_to_host
subroutine, public device_event_destroy(event)
Destroy a device event.
subroutine, public device_event_create(event, flags)
Create a device event queue.
Defines a gather-scatter backend.
Generic Gather-scatter backend for accelerators.
subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
Gather kernel.
subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
Scatter kernel.
subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
Accelerator backend initialisation.
subroutine gs_device_free(this)
Dummy backend deallocation.
Defines Gather-scatter operations.
integer, parameter, public rp
Global precision used in computations.
Gather-scatter backend for offloading devices.