Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
fusedcg_device.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2024, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
36 use precon, only : pc_t
37 use ax_product, only : ax_t
38 use num_types, only: rp, c_rp
39 use field, only : field_t
40 use coefs, only : coef_t
41 use gather_scatter, only : gs_t, gs_op_add
42 use bc_list, only : bc_list_t
43 use math, only : glsc3, rzero, copy, abscmp
48 use utils, only : neko_error
50 use mpi_f08, only : mpi_allreduce, mpi_in_place, mpi_sum
51 use, intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr, &
52 c_associated, c_size_t, c_sizeof, c_int, c_loc
53 implicit none
54 private
55
56 integer, parameter :: device_fusedcg_p_space = 10
57
59 type, public, extends(ksp_t) :: fusedcg_device_t
60 real(kind=rp), allocatable :: w(:)
61 real(kind=rp), allocatable :: r(:)
62 real(kind=rp), allocatable :: z(:)
63 real(kind=rp), allocatable :: p(:,:)
64 real(kind=rp), allocatable :: alpha(:)
65 type(c_ptr) :: w_d = c_null_ptr
66 type(c_ptr) :: r_d = c_null_ptr
67 type(c_ptr) :: z_d = c_null_ptr
68 type(c_ptr) :: alpha_d = c_null_ptr
69 type(c_ptr) :: p_d_d = c_null_ptr
70 type(c_ptr), allocatable :: p_d(:)
71 type(c_ptr) :: gs_event = c_null_ptr
72 contains
73 procedure, pass(this) :: init => fusedcg_device_init
74 procedure, pass(this) :: free => fusedcg_device_free
75 procedure, pass(this) :: solve => fusedcg_device_solve
76 procedure, pass(this) :: solve_coupled => fusedcg_device_solve_coupled
77 end type fusedcg_device_t
78
79#ifdef HAVE_CUDA
80 interface
81 subroutine cuda_fusedcg_update_p(p_d, z_d, po_d, beta, n) &
82 bind(c, name='cuda_fusedcg_update_p')
83 use, intrinsic :: iso_c_binding
84 import c_rp
85 implicit none
86 type(c_ptr), value :: p_d, z_d, po_d
87 real(c_rp) :: beta
88 integer(c_int) :: n
89 end subroutine cuda_fusedcg_update_p
90 end interface
91
92 interface
93 subroutine cuda_fusedcg_update_x(x_d, p_d, alpha, p_cur, n) &
94 bind(c, name='cuda_fusedcg_update_x')
95 use, intrinsic :: iso_c_binding
96 implicit none
97 type(c_ptr), value :: x_d, p_d, alpha
98 integer(c_int) :: p_cur, n
99 end subroutine cuda_fusedcg_update_x
100 end interface
101
102 interface
103 real(c_rp) function cuda_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
104 p_cur, n) bind(c, name='cuda_fusedcg_part2')
105 use, intrinsic :: iso_c_binding
106 import c_rp
107 implicit none
108 type(c_ptr), value :: a_d, b_d, c_d, alpha_d
109 real(c_rp) :: alpha
110 integer(c_int) :: n, p_cur
111 end function cuda_fusedcg_part2
112 end interface
113#elif HAVE_HIP
114 interface
115 subroutine hip_fusedcg_update_p(p_d, z_d, po_d, beta, n) &
116 bind(c, name='hip_fusedcg_update_p')
117 use, intrinsic :: iso_c_binding
118 import c_rp
119 implicit none
120 type(c_ptr), value :: p_d, z_d, po_d
121 real(c_rp) :: beta
122 integer(c_int) :: n
123 end subroutine hip_fusedcg_update_p
124 end interface
125
126 interface
127 subroutine hip_fusedcg_update_x(x_d, p_d, alpha, p_cur, n) &
128 bind(c, name='hip_fusedcg_update_x')
129 use, intrinsic :: iso_c_binding
130 implicit none
131 type(c_ptr), value :: x_d, p_d, alpha
132 integer(c_int) :: p_cur, n
133 end subroutine hip_fusedcg_update_x
134 end interface
135
136 interface
137 real(c_rp) function hip_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
138 p_cur, n) bind(c, name='hip_fusedcg_part2')
139 use, intrinsic :: iso_c_binding
140 import c_rp
141 implicit none
142 type(c_ptr), value :: a_d, b_d, c_d, alpha_d
143 real(c_rp) :: alpha
144 integer(c_int) :: n, p_cur
145 end function hip_fusedcg_part2
146 end interface
147#endif
148
149contains
150
151 subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
152 type(c_ptr), value :: p_d, z_d, po_d
153 real(c_rp) :: beta
154 integer(c_int) :: n
155#ifdef HAVE_HIP
156 call hip_fusedcg_update_p(p_d, z_d, po_d, beta, n)
157#elif HAVE_CUDA
158 call cuda_fusedcg_update_p(p_d, z_d, po_d, beta, n)
159#else
160 call neko_error('No device backend configured')
161#endif
162 end subroutine device_fusedcg_update_p
163
164 subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
165 type(c_ptr), value :: x_d, p_d, alpha
166 integer(c_int) :: p_cur, n
167#ifdef HAVE_HIP
168 call hip_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
169#elif HAVE_CUDA
170 call cuda_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
171#else
172 call neko_error('No device backend configured')
173#endif
174 end subroutine device_fusedcg_update_x
175
176 function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
177 p_cur, n) result(res)
178 type(c_ptr), value :: a_d, b_d, c_d, alpha_d
179 real(c_rp) :: alpha
180 integer :: n, p_cur
181 real(kind=rp) :: res
182 integer :: ierr
183#ifdef HAVE_HIP
184 res = hip_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
185#elif HAVE_CUDA
186 res = cuda_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
187#else
188 call neko_error('No device backend configured')
189#endif
190
191#ifndef HAVE_DEVICE_MPI
192 if (pe_size .gt. 1) then
193 call mpi_allreduce(mpi_in_place, res, 1, &
194 mpi_real_precision, mpi_sum, neko_comm, ierr)
195 end if
196#endif
197
198 end function device_fusedcg_part2
199
201 subroutine fusedcg_device_init(this, n, max_iter, M, rel_tol, abs_tol, &
202 monitor)
203 class(fusedcg_device_t), target, intent(inout) :: this
204 class(pc_t), optional, intent(in), target :: M
205 integer, intent(in) :: n
206 integer, intent(in) :: max_iter
207 real(kind=rp), optional, intent(in) :: rel_tol
208 real(kind=rp), optional, intent(in) :: abs_tol
209 logical, optional, intent(in) :: monitor
210 type(c_ptr) :: ptr
211 integer(c_size_t) :: p_size
212 integer :: i
213
214 call this%free()
215
216 allocate(this%w(n))
217 allocate(this%r(n))
218 allocate(this%z(n))
219 allocate(this%p(n, device_fusedcg_p_space))
220 allocate(this%p_d(device_fusedcg_p_space))
221 allocate(this%alpha(device_fusedcg_p_space))
222
223 if (present(m)) then
224 this%M => m
225 end if
226
227 call device_map(this%w, this%w_d, n)
228 call device_map(this%r, this%r_d, n)
229 call device_map(this%z, this%z_d, n)
230 call device_map(this%alpha, this%alpha_d, device_fusedcg_p_space)
231 do i = 1, device_fusedcg_p_space+1
232 this%p_d(i) = c_null_ptr
233 call device_map(this%p(:,i), this%p_d(i), n)
234 end do
235
236 p_size = c_sizeof(c_null_ptr) * (device_fusedcg_p_space)
237 call device_alloc(this%p_d_d, p_size)
238 ptr = c_loc(this%p_d)
239 call device_memcpy(ptr, this%p_d_d, p_size, &
240 host_to_device, sync=.false.)
241 if (present(rel_tol) .and. present(abs_tol) .and. present(monitor)) then
242 call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
243 else if (present(rel_tol) .and. present(abs_tol)) then
244 call this%ksp_init(max_iter, rel_tol, abs_tol)
245 else if (present(monitor) .and. present(abs_tol)) then
246 call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
247 else if (present(rel_tol) .and. present(monitor)) then
248 call this%ksp_init(max_iter, rel_tol, monitor = monitor)
249 else if (present(rel_tol)) then
250 call this%ksp_init(max_iter, rel_tol = rel_tol)
251 else if (present(abs_tol)) then
252 call this%ksp_init(max_iter, abs_tol = abs_tol)
253 else if (present(monitor)) then
254 call this%ksp_init(max_iter, monitor = monitor)
255 else
256 call this%ksp_init(max_iter)
257 end if
258
259 call device_event_create(this%gs_event, 2)
260
261 end subroutine fusedcg_device_init
262
264 subroutine fusedcg_device_free(this)
265 class(fusedcg_device_t), intent(inout) :: this
266 integer :: i
267
268 call this%ksp_free()
269
270 if (allocated(this%w)) then
271 if (c_associated(this%w_d)) then
272 call device_unmap(this%w, this%w_d)
273 end if
274 deallocate(this%w)
275 end if
276
277 if (allocated(this%r)) then
278 if (c_associated(this%r_d)) then
279 call device_unmap(this%r, this%r_d)
280 end if
281 deallocate(this%r)
282 end if
283
284
285 if (allocated(this%z)) then
286 if (c_associated(this%z_d)) then
287 call device_unmap(this%z, this%z_d)
288 end if
289 deallocate(this%z)
290 end if
291
292
293 if (allocated(this%alpha)) then
294 if (c_associated(this%alpha_d)) then
295 call device_unmap(this%alpha, this%alpha_d)
296 end if
297 deallocate(this%alpha)
298 end if
299
300 if (allocated(this%p)) then
301 if (allocated(this%p_d)) then
302 do i = 1, device_fusedcg_p_space
303 if (c_associated(this%p_d(i))) then
304 call device_unmap(this%p(:,i), this%p_d(i))
305 end if
306 end do
307 end if
308 deallocate(this%p)
309 end if
310
311 if (c_associated(this%p_d_d)) then
312 call device_free(this%p_d_d)
313 end if
314
315 nullify(this%M)
316
317 if (c_associated(this%gs_event)) then
318 call device_event_destroy(this%gs_event)
319 end if
320
321 end subroutine fusedcg_device_free
322
324 function fusedcg_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter) result(ksp_results)
325 class(fusedcg_device_t), intent(inout) :: this
326 class(ax_t), intent(in) :: ax
327 type(field_t), intent(inout) :: x
328 integer, intent(in) :: n
329 real(kind=rp), dimension(n), intent(in) :: f
330 type(coef_t), intent(inout) :: coef
331 type(bc_list_t), intent(inout) :: blst
332 type(gs_t), intent(inout) :: gs_h
333 type(ksp_monitor_t) :: ksp_results
334 integer, optional, intent(in) :: niter
335 integer :: iter, max_iter, ierr, i, p_cur, p_prev
336 real(kind=rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
337 real(kind=rp) :: pap, beta
338 type(c_ptr) :: f_d
339 f_d = device_get_ptr(f)
340
341 if (present(niter)) then
342 max_iter = niter
343 else
344 max_iter = ksp_max_iter
345 end if
346 norm_fac = 1.0_rp / sqrt(coef%volume)
347
348 associate(w => this%w, r => this%r, p => this%p, z => this%z, &
349 alpha => this%alpha, alpha_d => this%alpha_d, &
350 w_d => this%w_d, r_d => this%r_d, z_d => this%z_d, &
351 p_d => this%p_d, p_d_d => this%p_d_d)
352
353 rtz1 = 1.0_rp
355 p_cur = 1
356 call device_rzero(x%x_d, n)
357 call device_rzero(p_d(1), n)
358 call device_copy(r_d, f_d, n)
359
360 rtr = device_glsc3(r_d, coef%mult_d, r_d, n)
361 rnorm = sqrt(rtr)*norm_fac
362 ksp_results%res_start = rnorm
363 ksp_results%res_final = rnorm
364 ksp_results%iter = 0
365 if(abscmp(rnorm, 0.0_rp)) then
366 ksp_results%converged = .true.
367 return
368 end if
369
370 call this%monitor_start('FusedCG')
371 do iter = 1, max_iter
372 call this%M%solve(z, r, n)
373 rtz2 = rtz1
374 rtz1 = device_glsc3(r_d, coef%mult_d, z_d, n)
375 beta = rtz1 / rtz2
376 if (iter .eq. 1) beta = 0.0_rp
377 call device_fusedcg_update_p(p_d(p_cur), z_d, p_d(p_prev), beta, n)
378
379 call ax%compute(w, p(1, p_cur), coef, x%msh, x%Xh)
380 call gs_h%op(w, n, gs_op_add, this%gs_event)
381 call device_event_sync(this%gs_event)
382 call blst%apply(w, n)
383
384 pap = device_glsc3(w_d, coef%mult_d, this%p_d(p_cur), n)
385
386 alpha(p_cur) = rtz1 / pap
387 rtr = device_fusedcg_part2(r_d, coef%mult_d, w_d, &
388 alpha_d, alpha(p_cur), p_cur, n)
389 rnorm = sqrt(rtr)*norm_fac
390 call this%monitor_iter(iter, rnorm)
391 if ((p_cur .eq. device_fusedcg_p_space) .or. &
392 (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter) then
393 call device_fusedcg_update_x(x%x_d, p_d_d, alpha_d, p_cur, n)
394 p_prev = p_cur
395 p_cur = 1
396 if (rnorm .lt. this%abs_tol) exit
397 else
398 p_prev = p_cur
399 p_cur = p_cur + 1
400 end if
401 end do
402 call this%monitor_stop()
403 ksp_results%res_final = rnorm
404 ksp_results%iter = iter
405 ksp_results%converged = this%is_converged(iter, rnorm)
406
407 end associate
408
409 end function fusedcg_device_solve
410
412 function fusedcg_device_solve_coupled(this, Ax, x, y, z, fx, fy, fz, &
413 n, coef, blstx, blsty, blstz, gs_h, niter) result(ksp_results)
414 class(fusedcg_device_t), intent(inout) :: this
415 class(ax_t), intent(in) :: ax
416 type(field_t), intent(inout) :: x
417 type(field_t), intent(inout) :: y
418 type(field_t), intent(inout) :: z
419 integer, intent(in) :: n
420 real(kind=rp), dimension(n), intent(in) :: fx
421 real(kind=rp), dimension(n), intent(in) :: fy
422 real(kind=rp), dimension(n), intent(in) :: fz
423 type(coef_t), intent(inout) :: coef
424 type(bc_list_t), intent(inout) :: blstx
425 type(bc_list_t), intent(inout) :: blsty
426 type(bc_list_t), intent(inout) :: blstz
427 type(gs_t), intent(inout) :: gs_h
428 type(ksp_monitor_t), dimension(3) :: ksp_results
429 integer, optional, intent(in) :: niter
430
431 ksp_results(1) = this%solve(ax, x, fx, n, coef, blstx, gs_h, niter)
432 ksp_results(2) = this%solve(ax, y, fy, n, coef, blsty, gs_h, niter)
433 ksp_results(3) = this%solve(ax, z, fz, n, coef, blstz, gs_h, niter)
434
436
437end module fusedcg_device
__device__ T solve(const T u, const T y, const T guess, const T nu, const T kappa, const T B)
void hip_fusedcg_update_x(void *x, void *p, void *alpha, int *p_cur, int *n)
real hip_fusedcg_part2(void *a, void *b, void *c, void *alpha_d, real *alpha, int *p_cur, int *n)
void hip_fusedcg_update_p(void *p, void *z, void *po, real *beta, int *n)
Return the device pointer for an associated Fortran array.
Definition device.F90:108
Map a Fortran array to a device (allocate and associate)
Definition device.F90:78
Copy data between host and device (or device and device)
Definition device.F90:72
Unmap a Fortran array from a device (deassociate and free)
Definition device.F90:84
Defines a Matrix-vector product.
Definition ax.f90:34
Defines a list of bc_t.
Definition bc_list.f90:34
Coefficients.
Definition coef.f90:34
Definition comm.F90:1
type(mpi_datatype), public mpi_real_precision
MPI type for working precision of REAL types.
Definition comm.F90:53
integer, public pe_size
MPI size of communicator.
Definition comm.F90:61
type(mpi_comm), public neko_comm
MPI communicator.
Definition comm.F90:45
subroutine, public device_rzero(a_d, n, strm)
Zero a real vector.
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
Device abstraction, common interface for various accelerators.
Definition device.F90:34
subroutine, public device_event_sync(event)
Synchronize an event.
Definition device.F90:1594
integer, parameter, public host_to_device
Definition device.F90:48
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:240
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition device.F90:1550
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
Definition device.F90:209
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition device.F90:1516
Defines a field.
Definition field.f90:34
Defines a fused Conjugate Gradient method for accelerators.
subroutine fusedcg_device_free(this)
Deallocate a pipelined PCG solver.
real(kind=rp) function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
type(ksp_monitor_t) function, dimension(3) fusedcg_device_solve_coupled(this, ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Pipelined PCG solve coupled solve.
integer, parameter device_fusedcg_p_space
subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
subroutine fusedcg_device_init(this, n, max_iter, m, rel_tol, abs_tol, monitor)
Initialise a fused PCG solver.
type(ksp_monitor_t) function fusedcg_device_solve(this, ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
Gather-scatter.
Implements the base abstract type for Krylov solvers plus helper types.
Definition krylov.f90:34
integer, parameter, public ksp_max_iter
Maximum number of iters.
Definition krylov.f90:51
Definition math.f90:60
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
Definition math.f90:1285
subroutine, public copy(a, b, n)
Copy a vector .
Definition math.f90:289
subroutine, public rzero(a, n)
Zero a real vector.
Definition math.f90:233
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Krylov preconditioner.
Definition precon.f90:34
Utilities.
Definition utils.f90:35
Base type for a matrix-vector product providing .
Definition ax.f90:43
A list of allocatable `bc_t`. Follows the standard interface of lists.
Definition bc_list.f90:49
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Definition coef.f90:63
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Definition krylov.f90:56
Base abstract type for a canonical Krylov method, solving .
Definition krylov.f90:73
Defines a canonical Krylov preconditioner.
Definition precon.f90:40