Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
gs_device.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2025, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
35 use num_types, only : rp, c_rp
36 use gs_bcknd, only : gs_bcknd_t
41 use utils, only: neko_error
42 use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_null_ptr, &
43 c_associated, c_sizeof, c_size_t
44 implicit none
45 private
46
48 type, public, extends(gs_bcknd_t) :: gs_device_t
49 integer, allocatable :: local_blk_off(:)
50 integer, allocatable :: shared_blk_off(:)
51 type(c_ptr) :: local_gs_d = c_null_ptr
52 type(c_ptr) :: local_dof_gs_d = c_null_ptr
53 type(c_ptr) :: local_gs_dof_d = c_null_ptr
54 type(c_ptr) :: shared_gs_d = c_null_ptr
55 type(c_ptr) :: shared_dof_gs_d = c_null_ptr
56 type(c_ptr) :: shared_gs_dof_d = c_null_ptr
57 type(c_ptr) :: local_blk_len_d = c_null_ptr
58 type(c_ptr) :: shared_blk_len_d = c_null_ptr
59 type(c_ptr) :: local_blk_off_d = c_null_ptr
60 type(c_ptr) :: shared_blk_off_d = c_null_ptr
61 integer :: nlocal
62 integer :: nshared
63 logical :: shared_on_host
64 contains
65 procedure, pass(this) :: init => gs_device_init
66 procedure, pass(this) :: free => gs_device_free
67 procedure, pass(this) :: gather => gs_gather_device
68 procedure, pass(this) :: scatter => gs_scatter_device
69 end type gs_device_t
70
71#ifdef HAVE_HIP
72 interface
73 subroutine hip_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
74 bind(c, name='hip_gather_kernel')
75 use, intrinsic :: iso_c_binding
76 implicit none
77 integer(c_int) :: m, n, nb, o, op
78 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
79 end subroutine hip_gather_kernel
80 end interface
81
82 interface
83 subroutine hip_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
84 bind(c, name='hip_scatter_kernel')
85 use, intrinsic :: iso_c_binding
86 implicit none
87 integer(c_int) :: m, n, nb
88 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
89 end subroutine hip_scatter_kernel
90 end interface
91
92#elif HAVE_CUDA
93 interface
94 subroutine cuda_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
95 bind(c, name='cuda_gather_kernel')
96 use, intrinsic :: iso_c_binding
97 implicit none
98 integer(c_int) :: m, n, nb, o, op
99 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
100 end subroutine cuda_gather_kernel
101 end interface
102
103 interface
104 subroutine cuda_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
105 bind(c, name='cuda_scatter_kernel')
106 use, intrinsic :: iso_c_binding
107 implicit none
108 integer(c_int) :: m, n, nb
109 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
110 end subroutine cuda_scatter_kernel
111 end interface
112#elif HAVE_OPENCL
113 interface
114 subroutine opencl_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
115 bind(c, name='opencl_gather_kernel')
116 use, intrinsic :: iso_c_binding
117 implicit none
118 integer(c_int) :: m, n, nb, o, op
119 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
120 end subroutine opencl_gather_kernel
121 end interface
122
123 interface
124 subroutine opencl_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
125 bind(c, name='opencl_scatter_kernel')
126 use, intrinsic :: iso_c_binding
127 implicit none
128 integer(c_int) :: m, n, nb
129 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
130 end subroutine opencl_scatter_kernel
131 end interface
132#endif
133
134contains
135
137 subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
138 class(gs_device_t), intent(inout) :: this
139 integer, intent(in) :: nlocal
140 integer, intent(in) :: nshared
141 integer, intent(in) :: nlcl_blks
142 integer, intent(in) :: nshrd_blks
143
144 call this%free()
145
146 this%nlocal = nlocal
147 this%nshared = nshared
148
149 allocate(this%local_blk_off(nlcl_blks))
150 allocate(this%shared_blk_off(nshrd_blks))
151
152 this%local_gs_d = c_null_ptr
153 this%local_dof_gs_d = c_null_ptr
154 this%local_gs_dof_d = c_null_ptr
155 this%local_blk_len_d = c_null_ptr
156 this%local_blk_off_d = c_null_ptr
157 this%shared_gs_d = c_null_ptr
158 this%shared_dof_gs_d = c_null_ptr
159 this%shared_gs_dof_d = c_null_ptr
160 this%shared_blk_len_d = c_null_ptr
161 this%shared_blk_off_d = c_null_ptr
162
163 this%shared_on_host = .true.
164
165#if defined(HAVE_HIP) || defined(HAVE_CUDA)
166 call device_event_create(this%gather_event, 2)
167 call device_event_create(this%scatter_event, 2)
168#endif
169
170 this%gs_stream = glb_cmd_queue
171
172 end subroutine gs_device_init
173
175 subroutine gs_device_free(this)
176 class(gs_device_t), intent(inout) :: this
177
178 if (allocated(this%local_blk_off)) then
179 deallocate(this%local_blk_off)
180 end if
181
182 if (allocated(this%shared_blk_off)) then
183 deallocate(this%shared_blk_off)
184 end if
185
186 if (c_associated(this%local_gs_d)) then
187 call device_free(this%local_gs_d)
188 end if
189
190 if (c_associated(this%local_dof_gs_d)) then
191 call device_free(this%local_dof_gs_d)
192 end if
193
194 if (c_associated(this%local_gs_dof_d)) then
195 call device_free(this%local_gs_dof_d)
196 end if
197
198 if (c_associated(this%local_blk_len_d)) then
199 call device_free(this%local_blk_len_d)
200 end if
201
202 if (c_associated(this%shared_blk_len_d)) then
203 call device_free(this%shared_blk_len_d)
204 end if
205
206 if (c_associated(this%local_blk_off_d)) then
207 call device_free(this%local_blk_off_d)
208 end if
209
210 if (c_associated(this%shared_blk_off_d)) then
211 call device_free(this%shared_blk_off_d)
212 end if
213
214 this%nlocal = 0
215 this%nshared = 0
216
217#if defined(HAVE_HIP) || defined(HAVE_CUDA)
218 if (c_associated(this%gather_event)) then
219 call device_event_destroy(this%gather_event)
220 end if
221
222 if (c_associated(this%scatter_event)) then
223 call device_event_destroy(this%scatter_event)
224 end if
225#endif
226
227 if (c_associated(this%gs_stream)) then
228 this%gs_stream = c_null_ptr
229 end if
230
231 end subroutine gs_device_free
232
234 subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
235 integer, intent(in) :: m
236 integer, intent(in) :: n
237 integer, intent(in) :: nb
238 class(gs_device_t), intent(inout) :: this
239 real(kind=rp), dimension(m), intent(inout) :: v
240 integer, dimension(m), intent(inout) :: dg
241 real(kind=rp), dimension(n), intent(inout) :: u
242 integer, dimension(m), intent(inout) :: gd
243 integer, dimension(nb), intent(inout) :: b
244 integer, intent(in) :: o
245 integer, intent(in) :: op
246 logical, intent(in) :: shrd
247 integer :: i
248 type(c_ptr) :: u_d
249
250 u_d = device_get_ptr(u)
251
252 if (.not. shrd) then
253 associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
254 gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
255 bo=>this%local_blk_off, bo_d=>this%local_blk_off_d, &
256 strm=>this%gs_stream)
257
258 if (.not. c_associated(v_d)) then
259 call device_map(v, v_d, m)
260 block
261 real(c_rp) :: rp_dummy
262 integer(c_size_t) :: s
263 s = c_sizeof(rp_dummy) * m
264 call device_memset(v_d, 0, s, strm = strm)
265 end block
266 end if
267
268 if (.not. c_associated(dg_d)) then
269 call device_map(dg, dg_d, m)
270 call device_memcpy(dg, dg_d, m, host_to_device, &
271 sync=.false., strm=strm)
272 end if
273
274 if (.not. c_associated(gd_d)) then
275 call device_map(gd, gd_d, m)
276 call device_memcpy(gd, gd_d, m, host_to_device, &
277 sync=.false., strm=strm)
278 end if
279
280 if (nb .gt. 0) then
281 if (.not. c_associated(b_d)) then
282 call device_map(b, b_d, nb)
283 call device_memcpy(b, b_d, nb, host_to_device, &
284 sync=.false., strm=strm)
285 end if
286
287 if (.not. c_associated(bo_d)) then
288 call device_map(bo, bo_d, nb)
289 bo(1) = 0
290 do i = 2, nb
291 bo(i) = bo(i - 1) + b(i - 1)
292 end do
293 call device_memcpy(bo, bo_d, nb, host_to_device, &
294 sync=.false., strm=strm)
295 end if
296 end if
297
298#ifdef HAVE_HIP
299 call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
300 nb, b_d, bo_d, op, strm)
301#elif HAVE_CUDA
302 call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
303 nb, b_d, bo_d, op, strm)
304#elif HAVE_OPENCL
305 call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
306 nb, b_d, bo_d, op, strm)
307#else
308 call neko_error('No device backend configured')
309#endif
310
311 end associate
312 else if (shrd) then
313 associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
314 gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
315 bo=>this%shared_blk_off, bo_d=>this%shared_blk_off_d, &
316 strm=>this%gs_stream)
317
318 if (.not. c_associated(v_d)) then
319 call device_map(v, v_d, m)
320 block
321 real(c_rp) :: rp_dummy
322 integer(c_size_t) :: s
323 s = c_sizeof(rp_dummy) * m
324 call device_memset(v_d, 0, s, strm = strm)
325 end block
326 end if
327
328 if (.not. c_associated(dg_d)) then
329 call device_map(dg, dg_d, m)
330 call device_memcpy(dg, dg_d, m, host_to_device, &
331 sync=.false., strm=strm)
332 end if
333
334 if (.not. c_associated(gd_d)) then
335 call device_map(gd, gd_d, m)
336 call device_memcpy(gd, gd_d, m, host_to_device, &
337 sync=.false., strm=strm)
338 end if
339
340 if (nb .gt. 0) then
341 if (.not. c_associated(b_d)) then
342 call device_map(b, b_d, nb)
343 call device_memcpy(b, b_d, nb, host_to_device, &
344 sync=.false., strm=strm)
345 end if
346
347 if (.not. c_associated(bo_d)) then
348 call device_map(bo, bo_d, nb)
349 bo(1) = 0
350 do i = 2, nb
351 bo(i) = bo(i - 1) + b(i - 1)
352 end do
353 call device_memcpy(bo, bo_d, nb, host_to_device, &
354 sync=.false., strm=strm)
355 end if
356 end if
357
358
359#ifdef HAVE_HIP
360 call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
361 nb, b_d, bo_d, op, strm)
362#elif HAVE_CUDA
363 call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
364 nb, b_d, bo_d, op, strm)
365#elif HAVE_OPENCL
366 call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
367 nb, b_d, bo_d, op, strm)
368#else
369 call neko_error('No device backend configured')
370#endif
371
372#if defined(HAVE_HIP) || defined(HAVE_CUDA) || defined(HAVE_OPENCL)
373 call device_event_record(this%gather_event, strm)
374#endif
375
376 if (this%shared_on_host) then
377 if (this%nshared .eq. m) then
378 call device_memcpy(v, v_d, m, device_to_host, &
379 sync=.true., strm=strm)
380 end if
381 end if
382
383 end associate
384 end if
385
386 end subroutine gs_gather_device
387
389 subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
390 integer, intent(in) :: m
391 integer, intent(in) :: n
392 integer, intent(in) :: nb
393 class(gs_device_t), intent(inout) :: this
394 real(kind=rp), dimension(m), intent(inout) :: v
395 integer, dimension(m), intent(inout) :: dg
396 real(kind=rp), dimension(n), intent(inout) :: u
397 integer, dimension(m), intent(inout) :: gd
398 integer, dimension(nb), intent(inout) :: b
399 logical, intent(in) :: shrd
400 type(c_ptr) :: event
401 type(c_ptr) :: u_d
402
403 u_d = device_get_ptr(u)
404
405 if (.not. shrd) then
406 associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
407 gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
408 bo_d=>this%local_blk_off_d, strm=>this%gs_stream)
409#ifdef HAVE_HIP
410 call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
411#elif HAVE_CUDA
412 call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
413#elif HAVE_OPENCL
414 call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
415#else
416 call neko_error('No device backend configured')
417#endif
418 end associate
419 else if (shrd) then
420 associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
421 gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
422 bo_d=>this%shared_blk_off_d, strm=>this%gs_stream)
423
424 if (this%shared_on_host) then
425 call device_memcpy(v, v_d, m, host_to_device, &
426 sync=.false., strm=strm)
427 end if
428
429#ifdef HAVE_HIP
430 call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
431#elif HAVE_CUDA
432 call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
433#elif HAVE_OPENCL
434 call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
435#else
436 call neko_error('No device backend configured')
437#endif
438 end associate
439 end if
440
441#if defined(HAVE_HIP) || defined(HAVE_CUDA) || defined(HAVE_OPENCL)
442 if (c_associated(event)) then
443 call device_event_record(event, this%gs_stream)
444 else
445 call device_sync(this%gs_stream)
446 end if
447#endif
448
449
450 end subroutine gs_scatter_device
451
452end module gs_device
void opencl_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, void *cmd_queue)
Definition gs.c:58
void opencl_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, void *cmd_queue)
Definition gs.c:166
void cuda_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, cudaStream_t stream)
Definition gs.cu:51
void cuda_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, cudaStream_t stream)
Definition gs.cu:96
Return the device pointer for an associated Fortran array.
Definition device.F90:96
Map a Fortran array to a device (allocate and associate)
Definition device.F90:72
Copy data between host and device (or device and device)
Definition device.F90:66
Synchronize a device or stream.
Definition device.F90:102
Device abstraction, common interface for various accelerators.
Definition device.F90:34
subroutine, public device_event_record(event, stream)
Record a device event.
Definition device.F90:1290
integer, parameter, public host_to_device
Definition device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:214
integer, parameter, public device_to_host
Definition device.F90:47
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition device.F90:1274
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
Definition device.F90:51
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition device.F90:1244
subroutine, public device_memset(x_d, v, s, sync, strm)
Set memory on the device to a value.
Definition device.F90:233
Defines a gather-scatter backend.
Definition gs_bcknd.f90:34
Generic Gather-scatter backend for accelerators.
Definition gs_device.F90:34
subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
Gather kernel.
subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
Scatter kernel.
subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
Accelerator backend initialisation.
subroutine gs_device_free(this)
Dummy backend deallocation.
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Utilities.
Definition utils.f90:35
Gather-scatter backend.
Definition gs_bcknd.f90:44
Gather-scatter backend for offloading devices.
Definition gs_device.F90:48