Neko 1.99.2
A portable framework for high-order spectral element flow simulations
gs_device.F90
1! Copyright (c) 2021-2026, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
33!> Generic Gather-scatter backend for accelerators.
34module gs_device
35 use num_types, only : rp, c_rp
36 use gs_bcknd, only : gs_bcknd_t
37 use device
41 use utils, only : neko_error
42 use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_null_ptr, &
43 c_associated, c_sizeof, c_size_t
44 implicit none
45 private
46
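47 !> Gather-scatter backend for offloading devices.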
48 type, public, extends(gs_bcknd_t) :: gs_device_t
49 type(c_ptr) :: local_gs_d = c_null_ptr
50 type(c_ptr) :: local_dof_gs_d = c_null_ptr
51 type(c_ptr) :: local_gs_dof_d = c_null_ptr
52 type(c_ptr) :: shared_gs_d = c_null_ptr
53 type(c_ptr) :: shared_dof_gs_d = c_null_ptr
54 type(c_ptr) :: shared_gs_dof_d = c_null_ptr
55 type(c_ptr) :: local_blk_len_d = c_null_ptr
56 type(c_ptr) :: shared_blk_len_d = c_null_ptr
57 type(c_ptr) :: local_blk_off_d = c_null_ptr
58 type(c_ptr) :: shared_blk_off_d = c_null_ptr
59 integer :: nlocal
60 integer :: nshared
61 logical :: shared_on_host
62 contains
63 procedure, pass(this) :: init => gs_device_init
64 procedure, pass(this) :: free => gs_device_free
65 procedure, pass(this) :: gather => gs_gather_device
66 procedure, pass(this) :: scatter => gs_scatter_device
67 end type gs_device_t
68
69#ifdef HAVE_HIP
70 interface
71 subroutine hip_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
72 bind(c, name = 'hip_gather_kernel')
73 use, intrinsic :: iso_c_binding
74 implicit none
75 integer(c_int) :: m, n, nb, o, op
76 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
77 end subroutine hip_gather_kernel
78 end interface
79
80 interface
81 subroutine hip_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
82 bind(c, name = 'hip_scatter_kernel')
83 use, intrinsic :: iso_c_binding
84 implicit none
85 integer(c_int) :: m, n, nb
86 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
87 end subroutine hip_scatter_kernel
88 end interface
89
90#elif HAVE_CUDA
91 interface
92 subroutine cuda_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
93 bind(c, name = 'cuda_gather_kernel')
94 use, intrinsic :: iso_c_binding
95 implicit none
96 integer(c_int) :: m, n, nb, o, op
97 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
98 end subroutine cuda_gather_kernel
99 end interface
100
101 interface
102 subroutine cuda_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
103 bind(c, name = 'cuda_scatter_kernel')
104 use, intrinsic :: iso_c_binding
105 implicit none
106 integer(c_int) :: m, n, nb
107 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
108 end subroutine cuda_scatter_kernel
109 end interface
110#elif HAVE_OPENCL
111 interface
112 subroutine opencl_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, &
113 strm) bind(c, name = 'opencl_gather_kernel')
114 use, intrinsic :: iso_c_binding
115 implicit none
116 integer(c_int) :: m, n, nb, o, op
117 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
118 end subroutine opencl_gather_kernel
119 end interface
120
121 interface
122 subroutine opencl_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
123 bind(c, name = 'opencl_scatter_kernel')
124 use, intrinsic :: iso_c_binding
125 implicit none
126 integer(c_int) :: m, n, nb
127 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
128 end subroutine opencl_scatter_kernel
129 end interface
130#endif
131
132contains
133
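134 !> Accelerator backend initialisation.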
135 subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
136 class(gs_device_t), intent(inout) :: this
137 integer, intent(in) :: nlocal
138 integer, intent(in) :: nshared
139 integer, intent(in) :: nlcl_blks
140 integer, intent(in) :: nshrd_blks
141
142 call this%free()
143
144 this%nlocal = nlocal
145 this%nshared = nshared
146
147 this%local_gs_d = c_null_ptr
148 this%local_dof_gs_d = c_null_ptr
149 this%local_gs_dof_d = c_null_ptr
150 this%local_blk_len_d = c_null_ptr
151 this%local_blk_off_d = c_null_ptr
152 this%shared_gs_d = c_null_ptr
153 this%shared_dof_gs_d = c_null_ptr
154 this%shared_gs_dof_d = c_null_ptr
155 this%shared_blk_len_d = c_null_ptr
156 this%shared_blk_off_d = c_null_ptr
157
158 this%shared_on_host = .true.
159
160#if defined(HAVE_HIP) || defined(HAVE_CUDA)
161 call device_event_create(this%gather_event, 2)
162 call device_event_create(this%scatter_event, 2)
163#endif
164
165 this%gs_stream = glb_cmd_queue
166
167 end subroutine gs_device_init
168
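169 !> Dummy backend deallocation.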
170 subroutine gs_device_free(this)
171 class(gs_device_t), intent(inout) :: this
172
173 if (c_associated(this%local_gs_d)) then
174 call device_free(this%local_gs_d)
175 end if
176
177 if (c_associated(this%local_dof_gs_d)) then
178 call device_free(this%local_dof_gs_d)
179 end if
180
181 if (c_associated(this%local_gs_dof_d)) then
182 call device_free(this%local_gs_dof_d)
183 end if
184
185 if (c_associated(this%local_blk_len_d)) then
186 call device_free(this%local_blk_len_d)
187 end if
188
189 if (c_associated(this%shared_blk_len_d)) then
190 call device_free(this%shared_blk_len_d)
191 end if
192
193 if (c_associated(this%local_blk_off_d)) then
194 call device_free(this%local_blk_off_d)
195 end if
196
197 if (c_associated(this%shared_blk_off_d)) then
198 call device_free(this%shared_blk_off_d)
199 end if
200
201 this%nlocal = 0
202 this%nshared = 0
203
204#if defined(HAVE_HIP) || defined(HAVE_CUDA)
205 if (c_associated(this%gather_event)) then
206 call device_event_destroy(this%gather_event)
207 end if
208
209 if (c_associated(this%scatter_event)) then
210 call device_event_destroy(this%scatter_event)
211 end if
212#endif
213
214 if (c_associated(this%gs_stream)) then
215 this%gs_stream = c_null_ptr
216 end if
217
218 end subroutine gs_device_free
219
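220 !> Gather kernel.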
221 subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, bo, op, shrd)
222 integer, intent(in) :: m
223 integer, intent(in) :: n
224 integer, intent(in) :: nb
225 class(gs_device_t), intent(inout) :: this
226 real(kind=rp), dimension(m), intent(inout) :: v
227 integer, dimension(m), intent(inout) :: dg
228 real(kind=rp), dimension(n), intent(inout) :: u
229 integer, dimension(m), intent(inout) :: gd
230 integer, dimension(nb), intent(inout) :: b
231 integer, dimension(nb), intent(inout) :: bo
232 integer, intent(in) :: o
233 integer, intent(in) :: op
234 logical, intent(in) :: shrd
235 integer :: i
236 type(c_ptr) :: u_d
237
238 u_d = device_get_ptr(u)
239
240 if (.not. shrd) then
241 associate(v_d => this%local_gs_d, dg_d => this%local_dof_gs_d, &
242 gd_d => this%local_gs_dof_d, b_d => this%local_blk_len_d, &
243 bo_d => this%local_blk_off_d, strm => this%gs_stream)
244
245 if (.not. c_associated(v_d)) then
246 call device_map(v, v_d, m)
247 block
248 real(c_rp) :: rp_dummy
249 integer(c_size_t) :: s
250 s = c_sizeof(rp_dummy) * m
251 call device_memset(v_d, 0, s, strm = strm)
252 end block
253 end if
254
255 if (.not. c_associated(dg_d)) then
256 call device_map(dg, dg_d, m)
257 call device_memcpy(dg, dg_d, m, host_to_device, &
258 sync = .false., strm = strm)
259 end if
260
261 if (.not. c_associated(gd_d)) then
262 call device_map(gd, gd_d, m)
263 call device_memcpy(gd, gd_d, m, host_to_device, &
264 sync = .false., strm = strm)
265 end if
266
267 if (nb .gt. 0) then
268 if (.not. c_associated(b_d)) then
269 call device_map(b, b_d, nb)
270 call device_memcpy(b, b_d, nb, host_to_device, &
271 sync = .false., strm = strm)
272 end if
273
274 if (.not. c_associated(bo_d)) then
275 call device_map(bo, bo_d, nb)
276 call device_memcpy(bo, bo_d, nb, host_to_device, &
277 sync = .false., strm = strm)
278 end if
279 end if
280
281#ifdef HAVE_HIP
282 call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
283 nb, b_d, bo_d, op, strm)
284#elif HAVE_CUDA
285 call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
286 nb, b_d, bo_d, op, strm)
287#elif HAVE_OPENCL
288 call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
289 nb, b_d, bo_d, op, strm)
290#else
291 call neko_error('No device backend configured')
292#endif
293
294 end associate
295 else if (shrd) then
296 associate(v_d => this%shared_gs_d, dg_d => this%shared_dof_gs_d, &
297 gd_d => this%shared_gs_dof_d, b_d => this%shared_blk_len_d, &
298 bo_d => this%shared_blk_off_d, strm => this%gs_stream)
299
300 if (.not. c_associated(v_d)) then
301 call device_map(v, v_d, m)
302 block
303 real(c_rp) :: rp_dummy
304 integer(c_size_t) :: s
305 s = c_sizeof(rp_dummy) * m
306 call device_memset(v_d, 0, s, strm = strm)
307 end block
308 end if
309
310 if (.not. c_associated(dg_d)) then
311 call device_map(dg, dg_d, m)
312 call device_memcpy(dg, dg_d, m, host_to_device, &
313 sync = .false., strm = strm)
314 end if
315
316 if (.not. c_associated(gd_d)) then
317 call device_map(gd, gd_d, m)
318 call device_memcpy(gd, gd_d, m, host_to_device, &
319 sync = .false., strm = strm)
320 end if
321
322 if (nb .gt. 0) then
323 if (.not. c_associated(b_d)) then
324 call device_map(b, b_d, nb)
325 call device_memcpy(b, b_d, nb, host_to_device, &
326 sync = .false., strm = strm)
327 end if
328
329 if (.not. c_associated(bo_d)) then
330 call device_map(bo, bo_d, nb)
331 call device_memcpy(bo, bo_d, nb, host_to_device, &
332 sync = .false., strm = strm)
333 end if
334 end if
335
336
337#ifdef HAVE_HIP
338 call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
339 nb, b_d, bo_d, op, strm)
340#elif HAVE_CUDA
341 call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
342 nb, b_d, bo_d, op, strm)
343#elif HAVE_OPENCL
344 call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
345 nb, b_d, bo_d, op, strm)
346#else
347 call neko_error('No device backend configured')
348#endif
349
350#if defined(HAVE_HIP) || defined(HAVE_CUDA) || defined(HAVE_OPENCL)
351 call device_event_record(this%gather_event, strm)
352#endif
353
354 if (this%shared_on_host) then
355 if (this%nshared .eq. m) then
356 call device_memcpy(v, v_d, m, device_to_host, &
357 sync = .true., strm = strm)
358 end if
359 end if
360
361 end associate
362 end if
363
364 end subroutine gs_gather_device
365
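366 !> Scatter kernel.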
367 subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, bo, shrd, event)
368 integer, intent(in) :: m
369 integer, intent(in) :: n
370 integer, intent(in) :: nb
371 class(gs_device_t), intent(inout) :: this
372 real(kind=rp), dimension(m), intent(inout) :: v
373 integer, dimension(m), intent(inout) :: dg
374 real(kind=rp), dimension(n), intent(inout) :: u
375 integer, dimension(m), intent(inout) :: gd
376 integer, dimension(nb), intent(inout) :: b
377 integer, dimension(nb), intent(inout) :: bo
378 logical, intent(in) :: shrd
379 type(c_ptr) :: event
380 type(c_ptr) :: u_d
381
382 u_d = device_get_ptr(u)
383
384 if (.not. shrd) then
385 associate(v_d => this%local_gs_d, dg_d => this%local_dof_gs_d, &
386 gd_d => this%local_gs_dof_d, b_d => this%local_blk_len_d, &
387 bo_d => this%local_blk_off_d, strm => this%gs_stream)
388#ifdef HAVE_HIP
389 call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, &
390 strm)
391#elif HAVE_CUDA
392 call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, &
393 strm)
394#elif HAVE_OPENCL
395 call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, &
396 bo_d, strm)
397#else
398 call neko_error('No device backend configured')
399#endif
400 end associate
401 else if (shrd) then
402 associate(v_d => this%shared_gs_d, dg_d => this%shared_dof_gs_d, &
403 gd_d => this%shared_gs_dof_d, b_d => this%shared_blk_len_d, &
404 bo_d => this%shared_blk_off_d, strm => this%gs_stream)
405
406 if (this%shared_on_host) then
407 call device_memcpy(v, v_d, m, host_to_device, &
408 sync = .false., strm = strm)
409 end if
410
411#ifdef HAVE_HIP
412 call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, &
413 strm)
414#elif HAVE_CUDA
415 call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, &
416 strm)
417#elif HAVE_OPENCL
418 call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, &
419 bo_d, strm)
420#else
421 call neko_error('No device backend configured')
422#endif
423 end associate
424 end if
425
426#if defined(HAVE_HIP) || defined(HAVE_CUDA) || defined(HAVE_OPENCL)
427 if (c_associated(event)) then
428 call device_event_record(event, this%gs_stream)
429 else
430 call device_sync(this%gs_stream)
431 end if
432#endif
433
434
435 end subroutine gs_scatter_device
436
437end module gs_device
void opencl_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, void *cmd_queue)
Definition gs.c:58
void opencl_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, void *cmd_queue)
Definition gs.c:170
void cuda_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, cudaStream_t stream)
Definition gs.cu:51
void cuda_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, cudaStream_t stream)
Definition gs.cu:96
device_get_ptr
Return the device pointer for an associated Fortran array.
Definition device.F90:101
device_map
Map a Fortran array to a device (allocate and associate).
Definition device.F90:77
device_memcpy
Copy data between host and device (or device and device).
Definition device.F90:71
device_sync
Synchronize a device or stream.
Definition device.F90:107
device
Device abstraction, common interface for various accelerators.
Definition device.F90:34
subroutine, public device_event_record(event, stream)
Record a device event.
Definition device.F90:1295
integer, parameter, public host_to_device
Definition device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:219
integer, parameter, public device_to_host
Definition device.F90:47
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition device.F90:1279
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
Definition device.F90:51
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition device.F90:1249
subroutine, public device_memset(x_d, v, s, sync, strm)
Set memory on the device to a value.
Definition device.F90:238
gs_bcknd
Defines a gather-scatter backend.
Definition gs_bcknd.f90:34
gs_device
Generic Gather-scatter backend for accelerators.
Definition gs_device.F90:34
subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
Accelerator backend initialisation.
Definition gs_device.F90:135
subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, bo, op, shrd)
Gather kernel.
Definition gs_device.F90:221
subroutine gs_device_free(this)
Dummy backend deallocation.
Definition gs_device.F90:170
subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, bo, shrd, event)
Scatter kernel.
Definition gs_device.F90:367
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
utils
Utilities.
Definition utils.f90:35
gs_bcknd_t
Gather-scatter backend.
Definition gs_bcknd.f90:44
gs_device_t
Gather-scatter backend for offloading devices.
Definition gs_device.F90:48