41 use,
intrinsic :: iso_c_binding, only : c_ptr, c_int, c_null_ptr, &
48 integer,
allocatable :: local_blk_off(:)
49 integer,
allocatable :: shared_blk_off(:)
50 type(c_ptr) :: local_gs_d = c_null_ptr
51 type(c_ptr) :: local_dof_gs_d = c_null_ptr
52 type(c_ptr) :: local_gs_dof_d = c_null_ptr
53 type(c_ptr) :: shared_gs_d = c_null_ptr
54 type(c_ptr) :: shared_dof_gs_d = c_null_ptr
55 type(c_ptr) :: shared_gs_dof_d = c_null_ptr
56 type(c_ptr) :: local_blk_len_d = c_null_ptr
57 type(c_ptr) :: shared_blk_len_d = c_null_ptr
58 type(c_ptr) :: local_blk_off_d = c_null_ptr
59 type(c_ptr) :: shared_blk_off_d = c_null_ptr
62 logical :: shared_on_host
72 subroutine hip_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
73 bind(c, name=
'hip_gather_kernel')
74 use,
intrinsic :: iso_c_binding
76 integer(c_int) :: m, n, nb, o, op
77 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
83 bind(c, name=
'hip_scatter_kernel')
84 use,
intrinsic :: iso_c_binding
86 integer(c_int) :: m, n, nb
87 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
93 subroutine cuda_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
94 bind(c, name=
'cuda_gather_kernel')
95 use,
intrinsic :: iso_c_binding
97 integer(c_int) :: m, n, nb, o, op
98 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
104 bind(c, name=
'cuda_scatter_kernel')
105 use,
intrinsic :: iso_c_binding
107 integer(c_int) :: m, n, nb
108 type(c_ptr),
value :: v, u, dg, gd, b, bo, strm
113 subroutine opencl_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op) &
114 bind(c, name=
'opencl_gather_kernel')
115 use,
intrinsic :: iso_c_binding
117 integer(c_int) :: m, n, nb, o, op
118 type(c_ptr),
value :: v, u, dg, gd, b, bo
124 bind(c, name=
'opencl_scatter_kernel')
125 use,
intrinsic :: iso_c_binding
127 integer(c_int) :: m, n, nb
128 type(c_ptr),
value :: v, u, dg, gd, b, bo
138 integer,
intent(in) :: nlocal
139 integer,
intent(in) :: nshared
140 integer,
intent(in) :: nlcl_blks
141 integer,
intent(in) :: nshrd_blks
146 this%nshared = nshared
148 allocate(this%local_blk_off(nlcl_blks))
149 allocate(this%shared_blk_off(nshrd_blks))
151 this%local_gs_d = c_null_ptr
152 this%local_dof_gs_d = c_null_ptr
153 this%local_gs_dof_d = c_null_ptr
154 this%local_blk_len_d = c_null_ptr
155 this%local_blk_off_d = c_null_ptr
156 this%shared_gs_d = c_null_ptr
157 this%shared_dof_gs_d = c_null_ptr
158 this%shared_gs_dof_d = c_null_ptr
159 this%shared_blk_len_d = c_null_ptr
160 this%shared_blk_off_d = c_null_ptr
162 this%shared_on_host = .true.
167 this%gs_stream = glb_cmd_queue
175 if (
allocated(this%local_blk_off))
then
176 deallocate(this%local_blk_off)
179 if (
allocated(this%shared_blk_off))
then
180 deallocate(this%shared_blk_off)
183 if (c_associated(this%local_gs_d))
then
187 if (c_associated(this%local_dof_gs_d))
then
191 if (c_associated(this%local_gs_dof_d))
then
195 if (c_associated(this%local_blk_len_d))
then
199 if (c_associated(this%shared_blk_len_d))
then
203 if (c_associated(this%local_blk_off_d))
then
207 if (c_associated(this%shared_blk_off_d))
then
214 if (c_associated(this%gather_event))
then
218 if (c_associated(this%scatter_event))
then
222 if (c_associated(this%gs_stream))
then
223 this%gs_stream = c_null_ptr
229 subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
230 integer,
intent(in) :: m
231 integer,
intent(in) :: n
232 integer,
intent(in) :: nb
234 real(kind=
rp),
dimension(m),
intent(inout) :: v
235 integer,
dimension(m),
intent(inout) :: dg
236 real(kind=
rp),
dimension(n),
intent(inout) :: u
237 integer,
dimension(m),
intent(inout) :: gd
238 integer,
dimension(nb),
intent(inout) :: b
239 integer,
intent(in) :: o
240 integer,
intent(in) :: op
241 logical,
intent(in) :: shrd
248 associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
249 gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
250 bo=>this%local_blk_off, bo_d=>this%local_blk_off_d, &
251 strm=>this%gs_stream)
253 if (.not. c_associated(v_d))
then
257 if (.not. c_associated(dg_d))
then
260 sync=.false., strm=strm)
263 if (.not. c_associated(gd_d))
then
266 sync=.false., strm=strm)
269 if (.not. c_associated(b_d))
then
272 sync=.false., strm=strm)
275 if (.not. c_associated(bo_d))
then
279 bo(i) = bo(i - 1) + b(i - 1)
282 sync=.false., strm=strm)
287 nb, b_d, bo_d, op, strm)
290 nb, b_d, bo_d, op, strm)
295 call neko_error(
'No device backend configured')
300 associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
301 gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
302 bo=>this%shared_blk_off, bo_d=>this%shared_blk_off_d, &
303 strm=>this%gs_stream)
305 if (.not. c_associated(v_d))
then
309 if (.not. c_associated(dg_d))
then
312 sync=.false., strm=strm)
315 if (.not. c_associated(gd_d))
then
318 sync=.false., strm=strm)
321 if (.not. c_associated(b_d))
then
324 sync=.false., strm=strm)
327 if (.not. c_associated(bo_d))
then
331 bo(i) = bo(i - 1) + b(i - 1)
334 sync=.false., strm=strm)
340 nb, b_d, bo_d, op, strm)
343 nb, b_d, bo_d, op, strm)
348 call neko_error(
'No device backend configured')
353 if (this%shared_on_host)
then
354 if (this%nshared .eq. m)
then
356 sync=.true., strm=strm)
366 subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
367 integer,
intent(in) :: m
368 integer,
intent(in) :: n
369 integer,
intent(in) :: nb
371 real(kind=rp),
dimension(m),
intent(inout) :: v
372 integer,
dimension(m),
intent(inout) :: dg
373 real(kind=rp),
dimension(n),
intent(inout) :: u
374 integer,
dimension(m),
intent(inout) :: gd
375 integer,
dimension(nb),
intent(inout) :: b
376 logical,
intent(in) :: shrd
380 u_d = device_get_ptr(u)
383 associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
384 gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
385 bo_d=>this%local_blk_off_d, strm=>this%gs_stream)
393 call neko_error(
'No device backend configured')
397 associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
398 gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
399 bo_d=>this%shared_blk_off_d, strm=>this%gs_stream)
401 if (this%shared_on_host)
then
402 call device_memcpy(v, v_d, m, host_to_device, &
403 sync=.false., strm=strm)
413 call neko_error(
'No device backend configured')
416 if (c_associated(event))
then
417 call device_event_record(event, strm)
419 call device_sync(strm)
void opencl_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op)
void opencl_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo)
void cuda_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, cudaStream_t stream)
void cuda_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, cudaStream_t stream)
Return the device pointer for an associated Fortran array.
Map a Fortran array to a device (allocate and associate)
Copy data between host and device (or device and device)
Device abstraction, common interface for various accelerators.
subroutine, public device_event_record(event, stream)
Record a device event.
integer, parameter, public host_to_device
subroutine, public device_free(x_d)
Deallocate memory on the device.
integer, parameter, public device_to_host
subroutine, public device_event_destroy(event)
Destroy a device event.
subroutine, public device_event_create(event, flags)
Create a device event queue.
Defines a gather-scatter backend.
Generic Gather-scatter backend for accelerators.
subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
Gather kernel.
subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
Scatter kernel.
subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
Accelerator backend initialisation.
subroutine gs_device_free(this)
Dummy backend deallocation.
Defines Gather-scatter operations.
integer, parameter, public rp
Global precision used in computations.
Gather-scatter backend for offloading devices.