Loading [MathJax]/extensions/tex2jax.js
Neko 0.9.1
A portable framework for high-order spectral element flow simulations
All Classes Namespaces Files Functions Variables Typedefs Enumerator Macros Pages
gs_device.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2022, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
35 use neko_config
36 use num_types
37 use gs_bcknd
38 use device
39 use gs_ops
40 use utils
41 use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_null_ptr, &
42 c_associated
43 implicit none
44 private
45
47 type, public, extends(gs_bcknd_t) :: gs_device_t
48 integer, allocatable :: local_blk_off(:)
49 integer, allocatable :: shared_blk_off(:)
50 type(c_ptr) :: local_gs_d = c_null_ptr
51 type(c_ptr) :: local_dof_gs_d = c_null_ptr
52 type(c_ptr) :: local_gs_dof_d = c_null_ptr
53 type(c_ptr) :: shared_gs_d = c_null_ptr
54 type(c_ptr) :: shared_dof_gs_d = c_null_ptr
55 type(c_ptr) :: shared_gs_dof_d = c_null_ptr
56 type(c_ptr) :: local_blk_len_d = c_null_ptr
57 type(c_ptr) :: shared_blk_len_d = c_null_ptr
58 type(c_ptr) :: local_blk_off_d = c_null_ptr
59 type(c_ptr) :: shared_blk_off_d = c_null_ptr
60 integer :: nlocal
61 integer :: nshared
62 logical :: shared_on_host
63 contains
64 procedure, pass(this) :: init => gs_device_init
65 procedure, pass(this) :: free => gs_device_free
66 procedure, pass(this) :: gather => gs_gather_device
67 procedure, pass(this) :: scatter => gs_scatter_device
68 end type gs_device_t
69
70#ifdef HAVE_HIP
71 interface
72 subroutine hip_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
73 bind(c, name='hip_gather_kernel')
74 use, intrinsic :: iso_c_binding
75 implicit none
76 integer(c_int) :: m, n, nb, o, op
77 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
78 end subroutine hip_gather_kernel
79 end interface
80
81 interface
82 subroutine hip_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
83 bind(c, name='hip_scatter_kernel')
84 use, intrinsic :: iso_c_binding
85 implicit none
86 integer(c_int) :: m, n, nb
87 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
88 end subroutine hip_scatter_kernel
89 end interface
90
91#elif HAVE_CUDA
92 interface
93 subroutine cuda_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op, strm) &
94 bind(c, name='cuda_gather_kernel')
95 use, intrinsic :: iso_c_binding
96 implicit none
97 integer(c_int) :: m, n, nb, o, op
98 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
99 end subroutine cuda_gather_kernel
100 end interface
101
102 interface
103 subroutine cuda_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo, strm) &
104 bind(c, name='cuda_scatter_kernel')
105 use, intrinsic :: iso_c_binding
106 implicit none
107 integer(c_int) :: m, n, nb
108 type(c_ptr), value :: v, u, dg, gd, b, bo, strm
109 end subroutine cuda_scatter_kernel
110 end interface
111#elif HAVE_OPENCL
112 interface
113 subroutine opencl_gather_kernel(v, m, o, dg, u, n, gd, nb, b, bo, op) &
114 bind(c, name='opencl_gather_kernel')
115 use, intrinsic :: iso_c_binding
116 implicit none
117 integer(c_int) :: m, n, nb, o, op
118 type(c_ptr), value :: v, u, dg, gd, b, bo
119 end subroutine opencl_gather_kernel
120 end interface
121
122 interface
123 subroutine opencl_scatter_kernel(v, m, dg, u, n, gd, nb, b, bo) &
124 bind(c, name='opencl_scatter_kernel')
125 use, intrinsic :: iso_c_binding
126 implicit none
127 integer(c_int) :: m, n, nb
128 type(c_ptr), value :: v, u, dg, gd, b, bo
129 end subroutine opencl_scatter_kernel
130 end interface
131#endif
132
133contains
134
136 subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
137 class(gs_device_t), intent(inout) :: this
138 integer, intent(in) :: nlocal
139 integer, intent(in) :: nshared
140 integer, intent(in) :: nlcl_blks
141 integer, intent(in) :: nshrd_blks
142
143 call this%free()
144
145 this%nlocal = nlocal
146 this%nshared = nshared
147
148 allocate(this%local_blk_off(nlcl_blks))
149 allocate(this%shared_blk_off(nshrd_blks))
150
151 this%local_gs_d = c_null_ptr
152 this%local_dof_gs_d = c_null_ptr
153 this%local_gs_dof_d = c_null_ptr
154 this%local_blk_len_d = c_null_ptr
155 this%local_blk_off_d = c_null_ptr
156 this%shared_gs_d = c_null_ptr
157 this%shared_dof_gs_d = c_null_ptr
158 this%shared_gs_dof_d = c_null_ptr
159 this%shared_blk_len_d = c_null_ptr
160 this%shared_blk_off_d = c_null_ptr
161
162 this%shared_on_host = .true.
163
164 call device_event_create(this%gather_event, 2)
165 call device_event_create(this%scatter_event, 2)
166
167 this%gs_stream = glb_cmd_queue
168
169 end subroutine gs_device_init
170
172 subroutine gs_device_free(this)
173 class(gs_device_t), intent(inout) :: this
174
175 if (allocated(this%local_blk_off)) then
176 deallocate(this%local_blk_off)
177 end if
178
179 if (allocated(this%shared_blk_off)) then
180 deallocate(this%shared_blk_off)
181 end if
182
183 if (c_associated(this%local_gs_d)) then
184 call device_free(this%local_gs_d)
185 end if
186
187 if (c_associated(this%local_dof_gs_d)) then
188 call device_free(this%local_dof_gs_d)
189 end if
190
191 if (c_associated(this%local_gs_dof_d)) then
192 call device_free(this%local_gs_dof_d)
193 end if
194
195 if (c_associated(this%local_blk_len_d)) then
196 call device_free(this%local_blk_len_d)
197 end if
198
199 if (c_associated(this%shared_blk_len_d)) then
200 call device_free(this%shared_blk_len_d)
201 end if
202
203 if (c_associated(this%local_blk_off_d)) then
204 call device_free(this%local_blk_off_d)
205 end if
206
207 if (c_associated(this%shared_blk_off_d)) then
208 call device_free(this%shared_blk_off_d)
209 end if
210
211 this%nlocal = 0
212 this%nshared = 0
213
214 if (c_associated(this%gather_event)) then
215 call device_event_destroy(this%gather_event)
216 end if
217
218 if (c_associated(this%scatter_event)) then
219 call device_event_destroy(this%scatter_event)
220 end if
221
222 if (c_associated(this%gs_stream)) then
223 this%gs_stream = c_null_ptr
224 end if
225
226 end subroutine gs_device_free
227
229 subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
230 integer, intent(in) :: m
231 integer, intent(in) :: n
232 integer, intent(in) :: nb
233 class(gs_device_t), intent(inout) :: this
234 real(kind=rp), dimension(m), intent(inout) :: v
235 integer, dimension(m), intent(inout) :: dg
236 real(kind=rp), dimension(n), intent(inout) :: u
237 integer, dimension(m), intent(inout) :: gd
238 integer, dimension(nb), intent(inout) :: b
239 integer, intent(in) :: o
240 integer, intent(in) :: op
241 logical, intent(in) :: shrd
242 integer :: i
243 type(c_ptr) :: u_d
244
245 u_d = device_get_ptr(u)
246
247 if (.not. shrd) then
248 associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
249 gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
250 bo=>this%local_blk_off, bo_d=>this%local_blk_off_d, &
251 strm=>this%gs_stream)
252
253 if (.not. c_associated(v_d)) then
254 call device_map(v, v_d, m)
255 end if
256
257 if (.not. c_associated(dg_d)) then
258 call device_map(dg, dg_d, m)
259 call device_memcpy(dg, dg_d, m, host_to_device, &
260 sync=.false., strm=strm)
261 end if
262
263 if (.not. c_associated(gd_d)) then
264 call device_map(gd, gd_d, m)
265 call device_memcpy(gd, gd_d, m, host_to_device, &
266 sync=.false., strm=strm)
267 end if
268
269 if (.not. c_associated(b_d)) then
270 call device_map(b, b_d, nb)
271 call device_memcpy(b, b_d, nb, host_to_device, &
272 sync=.false., strm=strm)
273 end if
274
275 if (.not. c_associated(bo_d)) then
276 call device_map(bo, bo_d, nb)
277 bo(1) = 0
278 do i = 2, nb
279 bo(i) = bo(i - 1) + b(i - 1)
280 end do
281 call device_memcpy(bo, bo_d, nb, host_to_device, &
282 sync=.false., strm=strm)
283 end if
284
285#ifdef HAVE_HIP
286 call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
287 nb, b_d, bo_d, op, strm)
288#elif HAVE_CUDA
289 call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
290 nb, b_d, bo_d, op, strm)
291#elif HAVE_OPENCL
292 call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
293 nb, b_d, bo_d, op)
294#else
295 call neko_error('No device backend configured')
296#endif
297
298 end associate
299 else if (shrd) then
300 associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
301 gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
302 bo=>this%shared_blk_off, bo_d=>this%shared_blk_off_d, &
303 strm=>this%gs_stream)
304
305 if (.not. c_associated(v_d)) then
306 call device_map(v, v_d, m)
307 end if
308
309 if (.not. c_associated(dg_d)) then
310 call device_map(dg, dg_d, m)
311 call device_memcpy(dg, dg_d, m, host_to_device, &
312 sync=.false., strm=strm)
313 end if
314
315 if (.not. c_associated(gd_d)) then
316 call device_map(gd, gd_d, m)
317 call device_memcpy(gd, gd_d, m, host_to_device, &
318 sync=.false., strm=strm)
319 end if
320
321 if (.not. c_associated(b_d)) then
322 call device_map(b, b_d, nb)
323 call device_memcpy(b, b_d, nb, host_to_device, &
324 sync=.false., strm=strm)
325 end if
326
327 if (.not. c_associated(bo_d)) then
328 call device_map(bo, bo_d, nb)
329 bo(1) = 0
330 do i = 2, nb
331 bo(i) = bo(i - 1) + b(i - 1)
332 end do
333 call device_memcpy(bo, bo_d, nb, host_to_device, &
334 sync=.false., strm=strm)
335 end if
336
337
338#ifdef HAVE_HIP
339 call hip_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
340 nb, b_d, bo_d, op, strm)
341#elif HAVE_CUDA
342 call cuda_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
343 nb, b_d, bo_d, op, strm)
344#elif HAVE_OPENCL
345 call opencl_gather_kernel(v_d, m, o, dg_d, u_d, n, gd_d, &
346 nb, b_d, bo_d, op)
347#else
348 call neko_error('No device backend configured')
349#endif
350
351 call device_event_record(this%gather_event, strm)
352
353 if (this%shared_on_host) then
354 if (this%nshared .eq. m) then
355 call device_memcpy(v, v_d, m, device_to_host, &
356 sync=.true., strm=strm)
357 end if
358 end if
359
360 end associate
361 end if
362
363 end subroutine gs_gather_device
364
366 subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
367 integer, intent(in) :: m
368 integer, intent(in) :: n
369 integer, intent(in) :: nb
370 class(gs_device_t), intent(inout) :: this
371 real(kind=rp), dimension(m), intent(inout) :: v
372 integer, dimension(m), intent(inout) :: dg
373 real(kind=rp), dimension(n), intent(inout) :: u
374 integer, dimension(m), intent(inout) :: gd
375 integer, dimension(nb), intent(inout) :: b
376 logical, intent(in) :: shrd
377 type(c_ptr) :: event
378 type(c_ptr) :: u_d
379
380 u_d = device_get_ptr(u)
381
382 if (.not. shrd) then
383 associate(v_d=>this%local_gs_d, dg_d=>this%local_dof_gs_d, &
384 gd_d=>this%local_gs_dof_d, b_d=>this%local_blk_len_d, &
385 bo_d=>this%local_blk_off_d, strm=>this%gs_stream)
386#ifdef HAVE_HIP
387 call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
388#elif HAVE_CUDA
389 call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
390#elif HAVE_OPENCL
391 call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d)
392#else
393 call neko_error('No device backend configured')
394#endif
395 end associate
396 else if (shrd) then
397 associate(v_d=>this%shared_gs_d, dg_d=>this%shared_dof_gs_d, &
398 gd_d=>this%shared_gs_dof_d, b_d=>this%shared_blk_len_d, &
399 bo_d=>this%shared_blk_off_d, strm=>this%gs_stream)
400
401 if (this%shared_on_host) then
402 call device_memcpy(v, v_d, m, host_to_device, &
403 sync=.false., strm=strm)
404 end if
405
406#ifdef HAVE_HIP
407 call hip_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
408#elif HAVE_CUDA
409 call cuda_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d, strm)
410#elif HAVE_OPENCL
411 call opencl_scatter_kernel(v_d, m, dg_d, u_d, n, gd_d, nb, b_d, bo_d)
412#else
413 call neko_error('No device backend configured')
414#endif
415
416 if (c_associated(event)) then
417 call device_event_record(event, strm)
418 else
419 call device_sync(strm)
420 end if
421
422 end associate
423 end if
424
425 end subroutine gs_scatter_device
426
427end module gs_device
void opencl_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op)
Definition gs.c:58
void opencl_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo)
Definition gs.c:166
void cuda_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op, cudaStream_t stream)
Definition gs.cu:51
void cuda_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, cudaStream_t stream)
Definition gs.cu:96
Return the device pointer for an associated Fortran array.
Definition device.F90:81
Map a Fortran array to a device (allocate and associate)
Definition device.F90:57
Copy data between host and device (or device and device)
Definition device.F90:51
Device abstraction, common interface for various accelerators.
Definition device.F90:34
subroutine, public device_event_record(event, stream)
Record a device event.
Definition device.F90:1210
integer, parameter, public host_to_device
Definition device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:185
integer, parameter, public device_to_host
Definition device.F90:47
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition device.F90:1194
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition device.F90:1164
Defines a gather-scatter backend.
Definition gs_bcknd.f90:34
Generic Gather-scatter backend for accelerators.
Definition gs_device.F90:34
subroutine gs_gather_device(this, v, m, o, dg, u, n, gd, nb, b, op, shrd)
Gather kernel.
subroutine gs_scatter_device(this, v, m, dg, u, n, gd, nb, b, shrd, event)
Scatter kernel.
subroutine gs_device_init(this, nlocal, nshared, nlcl_blks, nshrd_blks)
Accelerator backend initialisation.
subroutine gs_device_free(this)
Dummy backend deallocation.
Defines Gather-scatter operations.
Definition gs_ops.f90:34
Build configurations.
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Utilities.
Definition utils.f90:35
Gather-scatter backend.
Definition gs_bcknd.f90:44
Gather-scatter backend for offloading devices.
Definition gs_device.F90:47