Neko 0.9.99
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
gs_device.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2022, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
35 use neko_config
36 use num_types
37 use gs_bcknd
38 use device
39 use gs_ops
40 use utils
41 use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_null_ptr, &
42 c_associated
43 implicit none
44 private
45
47 type, public, extends(gs_bcknd_t) :: gs_device_t
     !> Host-side block offset tables for the purely process-local and the
     !> shared gather-scatter lists (filled lazily on first gather)
48 integer, allocatable :: local_blk_off(:)
49 integer, allocatable :: shared_blk_off(:)
     !> Device buffers mirroring the local gather-scatter data
     !> (c_null_ptr until mapped on first use)
50 type(c_ptr) :: local_gs_d = c_null_ptr
51 type(c_ptr) :: local_dof_gs_d = c_null_ptr
52 type(c_ptr) :: local_gs_dof_d = c_null_ptr
     !> Device buffers mirroring the shared gather-scatter data
53 type(c_ptr) :: shared_gs_d = c_null_ptr
54 type(c_ptr) :: shared_dof_gs_d = c_null_ptr
55 type(c_ptr) :: shared_gs_dof_d = c_null_ptr
     !> Device copies of the block length and block offset tables
56 type(c_ptr) :: local_blk_len_d = c_null_ptr
57 type(c_ptr) :: shared_blk_len_d = c_null_ptr
58 type(c_ptr) :: local_blk_off_d = c_null_ptr
59 type(c_ptr) :: shared_blk_off_d = c_null_ptr
     !> Number of purely local, resp. shared, gather-scatter entries
60 integer :: nlocal
61 integer :: nshared
     !> If true, shared values are copied back to the host after a shared
     !> gather and uploaded from the host before a shared scatter
62 logical :: shared_on_host
63 contains
64 procedure, pass(this) :: init => gs_device_init
65 procedure, pass(this) :: free => gs_device_free
66 procedure, pass(this) :: gather => gs_gather_device
67 procedure, pass(this) :: scatter => gs_scatter_device
68 end type gs_device_t
69
70#ifdef HAVE_HIP
 !> C interface to the HIP gather kernel. Integer scalars are passed by
 !> reference (the C side takes int * arguments); device buffers and the
 !> stream handle are passed by value as c_ptr.
71 interface
72 subroutine hip_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
73 bind(c, name='hip_gather_kernel')
74 use, intrinsic :: iso_c_binding
75 implicit none
76 integer(c_int) :: m, n, nb, o, op
77 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
78 end subroutine hip_gather_kernel
79 end interface
80
 !> C interface to the HIP scatter kernel.
81 interface
82 subroutine hip_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
83 bind(c, name='hip_scatter_kernel')
84 use, intrinsic :: iso_c_binding
85 implicit none
86 integer(c_int) :: m, n, nb
87 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
88 end subroutine hip_scatter_kernel
89 end interface
90
91#elif HAVE_CUDA
 !> C interface to the CUDA gather kernel (gs.cu); same convention as the
 !> HIP variant, with strm being a cudaStream_t.
92 interface
93 subroutine cuda_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
94 bind(c, name='cuda_gather_kernel')
95 use, intrinsic :: iso_c_binding
96 implicit none
97 integer(c_int) :: m, n, nb, o, op
98 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
99 end subroutine cuda_gather_kernel
100 end interface
101
 !> C interface to the CUDA scatter kernel (gs.cu).
102 interface
103 subroutine cuda_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
104 bind(c, name='cuda_scatter_kernel')
105 use, intrinsic :: iso_c_binding
106 implicit none
107 integer(c_int) :: m, n, nb
108 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
109 end subroutine cuda_scatter_kernel
110 end interface
111#elif HAVE_OPENCL
 !> C interface to the OpenCL gather kernel (gs.c). Note: no stream
 !> argument — presumably enqueued on the backend's own command queue;
 !> confirm against gs.c.
112 interface
113 subroutine opencl_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op) &
114 bind(c, name='opencl_gather_kernel')
115 use, intrinsic :: iso_c_binding
116 implicit none
117 integer(c_int) :: m, n, nb, o, op
118 type(c_ptr), value :: v, u, dg, gd, b, bo
119 end subroutine opencl_gather_kernel
120 end interface
121
 !> C interface to the OpenCL scatter kernel (gs.c).
122 interface
123 subroutine opencl_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo) &
124 bind(c, name='opencl_scatter_kernel')
125 use, intrinsic :: iso_c_binding
126 implicit none
127 integer(c_int) :: m, n, nb
128 type(c_ptr), value :: v, u, dg, gd, b, bo
129 end subroutine opencl_scatter_kernel
130 end interface
131#endif
132
133contains
134
136 subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
137 class(gs_device_t), intent(inout) :: this
138 integer, intent(in) :: nlocal
139 integer, intent(in) :: nshared
140 integer, intent(in) :: nlcl_blks
141 integer, intent(in) :: nshrd_blks
142
143 call this%free()
144
145 this%nlocal = nlocal
146 this%nshared = nshared
147
148 allocate(this%local_blk_off(nlcl_blks))
149 allocate(this%shared_blk_off(nshrd_blks))
150
151 this%local_gs_d = c_null_ptr
152 this%local_dof_gs_d = c_null_ptr
153 this%local_gs_dof_d = c_null_ptr
154 this%local_blk_len_d = c_null_ptr
155 this%local_blk_off_d = c_null_ptr
156 this%shared_gs_d = c_null_ptr
157 this%shared_dof_gs_d = c_null_ptr
158 this%shared_gs_dof_d = c_null_ptr
159 this%shared_blk_len_d = c_null_ptr
160 this%shared_blk_off_d = c_null_ptr
161
162 this%shared_on_host = .true.
163
164#if defined(HAVE_HIP) || defined(HAVE_CUDA)
165 call device_event_create(this%gather_event, 2)
166 call device_event_create(this%scatter_event, 2)
167#endif
168
169 this%gs_stream = glb_cmd_queue
170
171 end subroutine gs_device_init
172
174 subroutine gs_device_free(this)
175 class(gs_device_t), intent(inout) :: this
176
177 if (allocated(this%local_blk_off)) then
178 deallocate(this%local_blk_off)
179 end if
180
181 if (allocated(this%shared_blk_off)) then
182 deallocate(this%shared_blk_off)
183 end if
184
185 if (c_associated(this%local_gs_d)) then
186 call device_free(this%local_gs_d)
187 end if
188
189 if (c_associated(this%local_dof_gs_d)) then
190 call device_free(this%local_dof_gs_d)
191 end if
192
193 if (c_associated(this%local_gs_dof_d)) then
194 call device_free(this%local_gs_dof_d)
195 end if
196
197 if (c_associated(this%local_blk_len_d)) then
198 call device_free(this%local_blk_len_d)
199 end if
200
201 if (c_associated(this%shared_blk_len_d)) then
202 call device_free(this%shared_blk_len_d)
203 end if
204
205 if (c_associated(this%local_blk_off_d)) then
206 call device_free(this%local_blk_off_d)
207 end if
208
209 if (c_associated(this%shared_blk_off_d)) then
210 call device_free(this%shared_blk_off_d)
211 end if
212
213 this%nlocal = 0
214 this%nshared = 0
215
216#if defined(HAVE_HIP) || defined(HAVE_CUDA)
217 if (c_associated(this%gather_event)) then
218 call device_event_destroy(this%gather_event)
219 end if
220
221 if (c_associated(this%scatter_event)) then
222 call device_event_destroy(this%scatter_event)
223 end if
224#endif
225
226 if (c_associated(this%gs_stream)) then
227 this%gs_stream = c_null_ptr
228 end if
229
230 end subroutine gs_device_free
231
233 subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
234 integer, intent(in) :: m
235 integer, intent(in) :: n
236 integer, intent(in) :: nb
237 class(gs_device_t), intent(inout) :: this
238 real(kind=rp), dimension(m), intent(inout) :: v
239 integer, dimension(m), intent(inout) :: dg
240 real(kind=rp), dimension(n), intent(inout) :: u
241 integer, dimension(m), intent(inout) :: gd
242 integer, dimension(nb), intent(inout) :: b
243 integer, intent(in) :: o
244 integer, intent(in) :: op
245 logical, intent(in) :: shrd
246 integer :: i
247 type(c_ptr) :: u_d
248
249 u_d = device_get_ptr(u)
250
251 if (.not. shrd) then
252 associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
253 gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
254 bo=>this%local_blk_off, bo_d=>this%local_blk_off_d, &
255 strm=>this%gs_stream)
256
257 if (.not. c_associated(v_d)) then
258 call device_map(v, v_d, m)
259 end if
260
261 if (.not. c_associated(dg_d)) then
262 call device_map(dg, dg_d, m)
263 call device_memcpy(dg, dg_d, m, host_to_device, &
264 sync=.false., strm=strm)
265 end if
266
267 if (.not. c_associated(gd_d)) then
268 call device_map(gd, gd_d, m)
269 call device_memcpy(gd, gd_d, m, host_to_device, &
270 sync=.false., strm=strm)
271 end if
272
273 if (.not. c_associated(b_d)) then
274 call device_map(b, b_d, nb)
275 call device_memcpy(b, b_d, nb, host_to_device, &
276 sync=.false., strm=strm)
277 end if
278
279 if (.not. c_associated(bo_d)) then
280 call device_map(bo, bo_d, nb)
281 bo(1) = 0
282 do i = 2, nb
283 bo(i) = bo(i - 1) + b(i - 1)
284 end do
285 call device_memcpy(bo, bo_d, nb, host_to_device, &
286 sync=.false., strm=strm)
287 end if
288
289#ifdef HAVE_HIP
290 call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
291 nb, b_d, bo_d, op, strm)
292#elif HAVE_CUDA
293 call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
294 nb, b_d, bo_d, op, strm)
295#elif HAVE_OPENCL
296 call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
297 nb, b_d, bo_d, op)
298#else
299 call neko_error('No device backend configured')
300#endif
301
302 end associate
303 else if (shrd) then
304 associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
305 gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
306 bo=>this%shared_blk_off, bo_d=>this%shared_blk_off_d, &
307 strm=>this%gs_stream)
308
309 if (.not. c_associated(v_d)) then
310 call device_map(v, v_d, m)
311 end if
312
313 if (.not. c_associated(dg_d)) then
314 call device_map(dg, dg_d, m)
315 call device_memcpy(dg, dg_d, m, host_to_device, &
316 sync=.false., strm=strm)
317 end if
318
319 if (.not. c_associated(gd_d)) then
320 call device_map(gd, gd_d, m)
321 call device_memcpy(gd, gd_d, m, host_to_device, &
322 sync=.false., strm=strm)
323 end if
324
325 if (.not. c_associated(b_d)) then
326 call device_map(b, b_d, nb)
327 call device_memcpy(b, b_d, nb, host_to_device, &
328 sync=.false., strm=strm)
329 end if
330
331 if (.not. c_associated(bo_d)) then
332 call device_map(bo, bo_d, nb)
333 bo(1) = 0
334 do i = 2, nb
335 bo(i) = bo(i - 1) + b(i - 1)
336 end do
337 call device_memcpy(bo, bo_d, nb, host_to_device, &
338 sync=.false., strm=strm)
339 end if
340
341
342#ifdef HAVE_HIP
343 call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
344 nb, b_d, bo_d, op, strm)
345#elif HAVE_CUDA
346 call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
347 nb, b_d, bo_d, op, strm)
348#elif HAVE_OPENCL
349 call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
350 nb, b_d, bo_d, op)
351#else
352 call neko_error('No device backend configured')
353#endif
354
355#if defined(HAVE_HIP) || defined(HAVE_CUDA)
356 call device_event_record(this%gather_event, strm)
357#endif
358
359 if (this%shared_on_host) then
360 if (this%nshared .eq. m) then
361 call device_memcpy(v, v_d, m, device_to_host, &
362 sync=.true., strm=strm)
363 end if
364 end if
365
366 end associate
367 end if
368
369 end subroutine gs_gather_device
370
372 subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
373 integer, intent(in) :: m
374 integer, intent(in) :: n
375 integer, intent(in) :: nb
376 class(gs_device_t), intent(inout) :: this
377 real(kind=rp), dimension(m), intent(inout) :: v
378 integer, dimension(m), intent(inout) :: dg
379 real(kind=rp), dimension(n), intent(inout) :: u
380 integer, dimension(m), intent(inout) :: gd
381 integer, dimension(nb), intent(inout) :: b
382 logical, intent(in) :: shrd
383 type(c_ptr) :: event
384 type(c_ptr) :: u_d
385
386 u_d = device_get_ptr(u)
387
388 if (.not. shrd) then
389 associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
390 gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
391 bo_d=>this%local_blk_off_d, strm=>this%gs_stream)
392#ifdef HAVE_HIP
393 call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
394#elif HAVE_CUDA
395 call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
396#elif HAVE_OPENCL
397 call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d)
398#else
399 call neko_error('No device backend configured')
400#endif
401 end associate
402 else if (shrd) then
403 associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
404 gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
405 bo_d=>this%shared_blk_off_d, strm=>this%gs_stream)
406
407 if (this%shared_on_host) then
408 call device_memcpy(v, v_d, m, host_to_device, &
409 sync=.false., strm=strm)
410 end if
411
412#ifdef HAVE_HIP
413 call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
414#elif HAVE_CUDA
415 call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
416#elif HAVE_OPENCL
417 call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d)
418#else
419 call neko_error('No device backend configured')
420#endif
421
422#if defined(HAVE_HIP) || defined(HAVE_CUDA)
423 if (c_associated(event)) then
424 call device_event_record(event, strm)
425 else
426 call device_sync(strm)
427 end if
428#endif
429
430 end associate
431 end if
432
433 end subroutine gs_scatter_device
434
435end module gs_device
void opencl_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op)
Definition gs.c:58
void opencl_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo)
Definition gs.c:166
void cuda_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, cudaStream_t stream)
Definition gs.cu:51
void cuda_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, cudaStream_t stream)
Definition gs.cu:96
Return the device pointer for an associated Fortran array.
Definition device.F90:81
Map a Fortran array to a device (allocate and associate)
Definition device.F90:57
Copy data between host and device (or device and device)
Definition device.F90:51
Device abstraction, common interface for various accelerators.
Definition device.F90:34
subroutine, public device_event_record(event, stream)
Record a device event.
Definition device.F90:1210
integer, parameter, public host_to_device
Definition device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:185
integer, parameter, public device_to_host
Definition device.F90:47
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition device.F90:1194
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition device.F90:1164
Defines a gather-scatter backend.
Definition gs_bcknd.f90:34
Generic Gather-scatter backend for accelerators.
Definition gs_device.F90:34
subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
Gather kernel.
subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
Scatter kernel.
subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
Accelerator backend initialisation.
subroutine gs_device_free(this)
Dummy backend deallocation.
Defines Gather-scatter operations.
Definition gs_ops.f90:34
Build configurations.
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Utilities.
Definition utils.f90:35
Gather-scatter backend.
Definition gs_bcknd.f90:44
Gather-scatter backend for offloading devices.
Definition gs_device.F90:47