Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
gs_device.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2026, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
35 use num_types, only : rp, c_rp
36 use gs_bcknd, only : gs_bcknd_t
41 use utils, only : neko_error
42 use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_null_ptr, &
43 c_associated, c_sizeof, c_size_t
44 implicit none
45 private
46
48 type, public, extends(gs_bcknd_t) :: gs_device_t
49 type(c_ptr) :: local_gs_d = c_null_ptr
50 type(c_ptr) :: local_dof_gs_d = c_null_ptr
51 type(c_ptr) :: local_gs_dof_d = c_null_ptr
52 type(c_ptr) :: shared_gs_d = c_null_ptr
53 type(c_ptr) :: shared_dof_gs_d = c_null_ptr
54 type(c_ptr) :: shared_gs_dof_d = c_null_ptr
55 type(c_ptr) :: local_blk_len_d = c_null_ptr
56 type(c_ptr) :: shared_blk_len_d = c_null_ptr
57 type(c_ptr) :: local_blk_off_d = c_null_ptr
58 type(c_ptr) :: shared_blk_off_d = c_null_ptr
59 integer :: nlocal
60 integer :: nshared
61 logical :: shared_on_host
62 contains
63 procedure, pass(this) :: init => gs_device_init
64 procedure, pass(this) :: free => gs_device_free
65 procedure, pass(this) :: gather => gs_gather_device
66 procedure, pass(this) :: scatter => gs_scatter_device
67 end type gs_device_t
68
69#ifdef HAVE_HIP
70 interface
71 subroutine hip_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
72 bind(c, name = 'hip_gather_kernel')
73 use, intrinsic :: iso_c_binding
74 implicit none
75 integer(c_int) :: m, n, nb, o, op
76 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
77 end subroutine hip_gather_kernel
78 end interface
79
80 interface
81 subroutine hip_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
82 bind(c, name = 'hip_scatter_kernel')
83 use, intrinsic :: iso_c_binding
84 implicit none
85 integer(c_int) :: m, n, nb
86 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
87 end subroutine hip_scatter_kernel
88 end interface
89
90#elif HAVE_CUDA
91 interface
92 subroutine cuda_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
93 bind(c, name = 'cuda_gather_kernel')
94 use, intrinsic :: iso_c_binding
95 implicit none
96 integer(c_int) :: m, n, nb, o, op
97 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
98 end subroutine cuda_gather_kernel
99 end interface
100
101 interface
102 subroutine cuda_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
103 bind(c, name = 'cuda_scatter_kernel')
104 use, intrinsic :: iso_c_binding
105 implicit none
106 integer(c_int) :: m, n, nb
107 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
108 end subroutine cuda_scatter_kernel
109 end interface
110#elif HAVE_OPENCL
111 interface
112 subroutine opencl_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, &
113 strm) bind(c, name = 'opencl_gather_kernel')
114 use, intrinsic :: iso_c_binding
115 implicit none
116 integer(c_int) :: m, n, nb, o, op
117 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
118 end subroutine opencl_gather_kernel
119 end interface
120
121 interface
122 subroutine opencl_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
123 bind(c, name = 'opencl_scatter_kernel')
124 use, intrinsic :: iso_c_binding
125 implicit none
126 integer(c_int) :: m, n, nb
127 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
128 end subroutine opencl_scatter_kernel
129 end interface
130#elif HAVE_METAL
131 interface
132 subroutine metal_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, &
133 strm) bind(c, name = 'metal_gather_kernel')
134 use, intrinsic :: iso_c_binding
135 implicit none
136 integer(c_int) :: m, n, nb, o, op
137 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
138 end subroutine metal_gather_kernel
139 end interface
140
141 interface
142 subroutine metal_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
143 bind(c, name = 'metal_scatter_kernel')
144 use, intrinsic :: iso_c_binding
145 implicit none
146 integer(c_int) :: m, n, nb
147 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
148 end subroutine metal_scatter_kernel
149 end interface
150#endif
151
152contains
153
155 subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
156 class(gs_device_t), intent(inout) :: this
157 integer, intent(in) :: nlocal
158 integer, intent(in) :: nshared
159 integer, intent(in) :: nlcl_blks
160 integer, intent(in) :: nshrd_blks
161
162 call this%free()
163
164 this%nlocal = nlocal
165 this%nshared = nshared
166
167 this%local_gs_d = c_null_ptr
168 this%local_dof_gs_d = c_null_ptr
169 this%local_gs_dof_d = c_null_ptr
170 this%local_blk_len_d = c_null_ptr
171 this%local_blk_off_d = c_null_ptr
172 this%shared_gs_d = c_null_ptr
173 this%shared_dof_gs_d = c_null_ptr
174 this%shared_gs_dof_d = c_null_ptr
175 this%shared_blk_len_d = c_null_ptr
176 this%shared_blk_off_d = c_null_ptr
177
178 this%shared_on_host = .true.
179
180#if defined(HAVE_HIP) || defined(HAVE_CUDA) || defined(HAVE_METAL)
181 call device_event_create(this%gather_event, 2)
182 call device_event_create(this%scatter_event, 2)
183#endif
184
185 this%gs_stream = glb_cmd_queue
186
187 end subroutine gs_device_init
188
190 subroutine gs_device_free(this)
191 class(gs_device_t), intent(inout) :: this
192
193 if (c_associated(this%local_gs_d)) then
194 call device_free(this%local_gs_d)
195 end if
196
197 if (c_associated(this%local_dof_gs_d)) then
198 call device_free(this%local_dof_gs_d)
199 end if
200
201 if (c_associated(this%local_gs_dof_d)) then
202 call device_free(this%local_gs_dof_d)
203 end if
204
205 if (c_associated(this%local_blk_len_d)) then
206 call device_free(this%local_blk_len_d)
207 end if
208
209 if (c_associated(this%shared_blk_len_d)) then
210 call device_free(this%shared_blk_len_d)
211 end if
212
213 if (c_associated(this%local_blk_off_d)) then
214 call device_free(this%local_blk_off_d)
215 end if
216
217 if (c_associated(this%shared_blk_off_d)) then
218 call device_free(this%shared_blk_off_d)
219 end if
220
221 this%nlocal = 0
222 this%nshared = 0
223
224#if defined(HAVE_HIP) || defined(HAVE_CUDA) || defined(HAVE_METAL)
225 if (c_associated(this%gather_event)) then
226 call device_event_destroy(this%gather_event)
227 end if
228
229 if (c_associated(this%scatter_event)) then
230 call device_event_destroy(this%scatter_event)
231 end if
232#endif
233
234 if (c_associated(this%gs_stream)) then
235 this%gs_stream = c_null_ptr
236 end if
237
238 end subroutine gs_device_free
239
241 subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, bo, op, shrd)
242 integer, intent(in) :: m
243 integer, intent(in) :: n
244 integer, intent(in) :: nb
245 class(gs_device_t), intent(inout) :: this
246 real(kind=rp), dimension(m), intent(inout) :: v
247 integer, dimension(m), intent(inout) :: dg
248 real(kind=rp), dimension(n), intent(inout) :: u
249 integer, dimension(m), intent(inout) :: gd
250 integer, dimension(nb), intent(inout) :: b
251 integer, dimension(nb), intent(inout) :: bo
252 integer, intent(in) :: o
253 integer, intent(in) :: op
254 logical, intent(in) :: shrd
255 integer :: i
256 type(c_ptr) :: u_d
257
258 u_d = device_get_ptr(u)
259
260 if (.not. shrd) then
261 associate(v_d => this%local_gs_d, dg_d => this%local_dof_gs_d, &
262 gd_d => this%local_gs_dof_d, b_d => this%local_blk_len_d, &
263 bo_d => this%local_blk_off_d, strm => this%gs_stream)
264
265 if (.not. c_associated(v_d)) then
266 call device_map(v, v_d, m)
267 block
268 real(c_rp) :: rp_dummy
269 integer(c_size_t) :: s
270 s = c_sizeof(rp_dummy) * m
271 call device_memset(v_d, 0, s, strm = strm)
272 end block
273 end if
274
275 if (.not. c_associated(dg_d)) then
276 call device_map(dg, dg_d, m)
277 call device_memcpy(dg, dg_d, m, host_to_device, &
278 sync = .false., strm = strm)
279 end if
280
281 if (.not. c_associated(gd_d)) then
282 call device_map(gd, gd_d, m)
283 call device_memcpy(gd, gd_d, m, host_to_device, &
284 sync = .false., strm = strm)
285 end if
286
287 if (nb .gt. 0) then
288 if (.not. c_associated(b_d)) then
289 call device_map(b, b_d, nb)
290 call device_memcpy(b, b_d, nb, host_to_device, &
291 sync = .false., strm = strm)
292 end if
293
294 if (.not. c_associated(bo_d)) then
295 call device_map(bo, bo_d, nb)
296 call device_memcpy(bo, bo_d, nb, host_to_device, &
297 sync = .false., strm = strm)
298 end if
299 end if
300
301#ifdef HAVE_HIP
302 call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
303 nb, b_d, bo_d, op, strm)
304#elif HAVE_CUDA
305 call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
306 nb, b_d, bo_d, op, strm)
307#elif HAVE_OPENCL
308 call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
309 nb, b_d, bo_d, op, strm)
310#elif HAVE_METAL
311 call metal_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
312 nb, b_d, bo_d, op, strm)
313#else
314 call neko_error('No device backend configured')
315#endif
316
317 end associate
318 else if (shrd) then
319 associate(v_d => this%shared_gs_d, dg_d => this%shared_dof_gs_d, &
320 gd_d => this%shared_gs_dof_d, b_d => this%shared_blk_len_d, &
321 bo_d => this%shared_blk_off_d, strm => this%gs_stream)
322
323 if (.not. c_associated(v_d)) then
324 call device_map(v, v_d, m)
325 block
326 real(c_rp) :: rp_dummy
327 integer(c_size_t) :: s
328 s = c_sizeof(rp_dummy) * m
329 call device_memset(v_d, 0, s, strm = strm)
330 end block
331 end if
332
333 if (.not. c_associated(dg_d)) then
334 call device_map(dg, dg_d, m)
335 call device_memcpy(dg, dg_d, m, host_to_device, &
336 sync = .false., strm = strm)
337 end if
338
339 if (.not. c_associated(gd_d)) then
340 call device_map(gd, gd_d, m)
341 call device_memcpy(gd, gd_d, m, host_to_device, &
342 sync = .false., strm = strm)
343 end if
344
345 if (nb .gt. 0) then
346 if (.not. c_associated(b_d)) then
347 call device_map(b, b_d, nb)
348 call device_memcpy(b, b_d, nb, host_to_device, &
349 sync = .false., strm = strm)
350 end if
351
352 if (.not. c_associated(bo_d)) then
353 call device_map(bo, bo_d, nb)
354 call device_memcpy(bo, bo_d, nb, host_to_device, &
355 sync = .false., strm = strm)
356 end if
357 end if
358
359
360#ifdef HAVE_HIP
361 call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
362 nb, b_d, bo_d, op, strm)
363#elif HAVE_CUDA
364 call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
365 nb, b_d, bo_d, op, strm)
366#elif HAVE_OPENCL
367 call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
368 nb, b_d, bo_d, op, strm)
369#elif HAVE_METAL
370 call metal_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
371 nb, b_d, bo_d, op, strm)
372#else
373 call neko_error('No device backend configured')
374#endif
375
376#if defined(HAVE_HIP) || defined(HAVE_CUDA) || \
377 defined(have_opencl) || defined(have_metal)
378 call device_event_record(this%gather_event, strm)
379#endif
380
381 if (this%shared_on_host) then
382 if (this%nshared .eq. m) then
383 call device_memcpy(v, v_d, m, device_to_host, &
384 sync = .true., strm = strm)
385 end if
386 end if
387
388 end associate
389 end if
390
391 end subroutine gs_gather_device
392
394 subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, bo, shrd, event)
395 integer, intent(in) :: m
396 integer, intent(in) :: n
397 integer, intent(in) :: nb
398 class(gs_device_t), intent(inout) :: this
399 real(kind=rp), dimension(m), intent(inout) :: v
400 integer, dimension(m), intent(inout) :: dg
401 real(kind=rp), dimension(n), intent(inout) :: u
402 integer, dimension(m), intent(inout) :: gd
403 integer, dimension(nb), intent(inout) :: b
404 integer, dimension(nb), intent(inout) :: bo
405 logical, intent(in) :: shrd
406 type(c_ptr) :: event
407 type(c_ptr) :: u_d
408
409 u_d = device_get_ptr(u)
410
411 if (.not. shrd) then
412 associate(v_d => this%local_gs_d, dg_d => this%local_dof_gs_d, &
413 gd_d => this%local_gs_dof_d, b_d => this%local_blk_len_d, &
414 bo_d => this%local_blk_off_d, strm => this%gs_stream)
415#ifdef HAVE_HIP
416 call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, &
417 strm)
418#elif HAVE_CUDA
419 call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, &
420 strm)
421#elif HAVE_OPENCL
422 call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, &
423 bo_d, strm)
424#elif HAVE_METAL
425 call metal_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, &
426 bo_d, strm)
427#else
428 call neko_error('No device backend configured')
429#endif
430 end associate
431 else if (shrd) then
432 associate(v_d => this%shared_gs_d, dg_d => this%shared_dof_gs_d, &
433 gd_d => this%shared_gs_dof_d, b_d => this%shared_blk_len_d, &
434 bo_d => this%shared_blk_off_d, strm => this%gs_stream)
435
436 if (this%shared_on_host) then
437 call device_memcpy(v, v_d, m, host_to_device, &
438 sync = .false., strm = strm)
439 end if
440
441#ifdef HAVE_HIP
442 call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, &
443 strm)
444#elif HAVE_CUDA
445 call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, &
446 strm)
447#elif HAVE_OPENCL
448 call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, &
449 bo_d, strm)
450#elif HAVE_METAL
451 call metal_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, &
452 bo_d, strm)
453#else
454 call neko_error('No device backend configured')
455#endif
456 end associate
457 end if
458
459#if defined(HAVE_HIP) || defined(HAVE_CUDA) || \
460 defined(have_opencl) || defined(have_metal)
461 if (c_associated(event)) then
462 call device_event_record(event, this%gs_stream)
463 else
464 call device_sync(this%gs_stream)
465 end if
466#endif
467
468
469 end subroutine gs_scatter_device
470
471end module gs_device
void opencl_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, void *cmd_queue)
Definition gs.c:58
void opencl_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, void *cmd_queue)
Definition gs.c:170
void cuda_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, cudaStream_t stream)
Definition gs.cu:51
void cuda_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, cudaStream_t stream)
Definition gs.cu:96
Return the device pointer for an associated Fortran array.
Definition device.F90:108
Map a Fortran array to a device (allocate and associate)
Definition device.F90:78
Copy data between host and device (or device and device)
Definition device.F90:72
Synchronize a device or stream.
Definition device.F90:114
Device abstraction, common interface for various accelerators.
Definition device.F90:34
subroutine, public device_event_record(event, stream)
Record a device event.
Definition device.F90:1571
integer, parameter, public host_to_device
Definition device.F90:48
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:240
integer, parameter, public device_to_host
Definition device.F90:48
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition device.F90:1550
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
Definition device.F90:52
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition device.F90:1516
subroutine, public device_memset(x_d, v, s, sync, strm)
Set memory on the device to a value.
Definition device.F90:263
Defines a gather-scatter backend.
Definition gs_bcknd.f90:34
Generic Gather-scatter backend for accelerators.
Definition gs_device.F90:34
subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
Accelerator backend initialisation.
subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, bo, op, shrd)
Gather kernel.
subroutine gs_device_free(this)
Dummy backend deallocation.
subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, bo, shrd, event)
Scatter kernel.
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Utilities.
Definition utils.f90:35
Gather-scatter backend.
Definition gs_bcknd.f90:44
Gather-scatter backend for offloading devices.
Definition gs_device.F90:48