Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
fusedcg_device.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2024, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
36 use precon, only : pc_t
37 use ax_product, only : ax_t
38 use num_types, only: rp, c_rp
39 use field, only : field_t
40 use coefs, only : coef_t
41 use gather_scatter, only : gs_t, gs_op_add
42 use bc_list, only : bc_list_t
43 use math, only : glsc3, rzero, copy, abscmp
48 use utils, only : neko_error
50 use mpi_f08, only : mpi_allreduce, mpi_in_place, mpi_sum
51 use, intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr, &
52 c_associated, c_size_t, c_sizeof, c_int, c_loc
53 implicit none
54 private
55
56 integer, parameter :: device_fusedcg_p_space = 10
57
59 type, public, extends(ksp_t) :: fusedcg_device_t
60 real(kind=rp), allocatable :: w(:)
61 real(kind=rp), allocatable :: r(:)
62 real(kind=rp), allocatable :: z(:)
63 real(kind=rp), allocatable :: p(:,:)
64 real(kind=rp), allocatable :: alpha(:)
65 type(c_ptr) :: w_d = c_null_ptr
66 type(c_ptr) :: r_d = c_null_ptr
67 type(c_ptr) :: z_d = c_null_ptr
68 type(c_ptr) :: alpha_d = c_null_ptr
69 type(c_ptr) :: p_d_d = c_null_ptr
70 type(c_ptr), allocatable :: p_d(:)
71 type(c_ptr) :: gs_event = c_null_ptr
72 contains
73 procedure, pass(this) :: init => fusedcg_device_init
74 procedure, pass(this) :: free => fusedcg_device_free
75 procedure, pass(this) :: solve => fusedcg_device_solve
76 procedure, pass(this) :: solve_coupled => fusedcg_device_solve_coupled
77 end type fusedcg_device_t
78
79#ifdef HAVE_CUDA
80 interface
81 subroutine cuda_fusedcg_update_p(p_d, z_d, po_d, beta, n) &
82 bind(c, name='cuda_fusedcg_update_p')
83 use, intrinsic :: iso_c_binding
84 import c_rp
85 implicit none
86 type(c_ptr), value :: p_d, z_d, po_d
87 real(c_rp) :: beta
88 integer(c_int) :: n
89 end subroutine cuda_fusedcg_update_p
90 end interface
91
92 interface
93 subroutine cuda_fusedcg_update_x(x_d, p_d, alpha, p_cur, n) &
94 bind(c, name='cuda_fusedcg_update_x')
95 use, intrinsic :: iso_c_binding
96 implicit none
97 type(c_ptr), value :: x_d, p_d, alpha
98 integer(c_int) :: p_cur, n
99 end subroutine cuda_fusedcg_update_x
100 end interface
101
102 interface
103 real(c_rp) function cuda_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
104 p_cur, n) bind(c, name='cuda_fusedcg_part2')
105 use, intrinsic :: iso_c_binding
106 import c_rp
107 implicit none
108 type(c_ptr), value :: a_d, b_d, c_d, alpha_d
109 real(c_rp) :: alpha
110 integer(c_int) :: n, p_cur
111 end function cuda_fusedcg_part2
112 end interface
113#elif HAVE_HIP
114 interface
115 subroutine hip_fusedcg_update_p(p_d, z_d, po_d, beta, n) &
116 bind(c, name='hip_fusedcg_update_p')
117 use, intrinsic :: iso_c_binding
118 import c_rp
119 implicit none
120 type(c_ptr), value :: p_d, z_d, po_d
121 real(c_rp) :: beta
122 integer(c_int) :: n
123 end subroutine hip_fusedcg_update_p
124 end interface
125
126 interface
127 subroutine hip_fusedcg_update_x(x_d, p_d, alpha, p_cur, n) &
128 bind(c, name='hip_fusedcg_update_x')
129 use, intrinsic :: iso_c_binding
130 implicit none
131 type(c_ptr), value :: x_d, p_d, alpha
132 integer(c_int) :: p_cur, n
133 end subroutine hip_fusedcg_update_x
134 end interface
135
136 interface
137 real(c_rp) function hip_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
138 p_cur, n) bind(c, name='hip_fusedcg_part2')
139 use, intrinsic :: iso_c_binding
140 import c_rp
141 implicit none
142 type(c_ptr), value :: a_d, b_d, c_d, alpha_d
143 real(c_rp) :: alpha
144 integer(c_int) :: n, p_cur
145 end function hip_fusedcg_part2
146 end interface
147#endif
148
149contains
150
151 subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
152 type(c_ptr), value :: p_d, z_d, po_d
153 real(c_rp) :: beta
154 integer(c_int) :: n
155#ifdef HAVE_HIP
156 call hip_fusedcg_update_p(p_d, z_d, po_d, beta, n)
157#elif HAVE_CUDA
158 call cuda_fusedcg_update_p(p_d, z_d, po_d, beta, n)
159#else
160 call neko_error('No device backend configured')
161#endif
162 end subroutine device_fusedcg_update_p
163
164 subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
165 type(c_ptr), value :: x_d, p_d, alpha
166 integer(c_int) :: p_cur, n
167#ifdef HAVE_HIP
168 call hip_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
169#elif HAVE_CUDA
170 call cuda_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
171#else
172 call neko_error('No device backend configured')
173#endif
174 end subroutine device_fusedcg_update_x
175
176 function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
177 p_cur, n) result(res)
178 type(c_ptr), value :: a_d, b_d, c_d, alpha_d
179 real(c_rp) :: alpha
180 integer :: n, p_cur
181 real(kind=rp) :: res
182 integer :: ierr
183#ifdef HAVE_HIP
184 res = hip_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
185#elif HAVE_CUDA
186 res = cuda_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
187#else
188 call neko_error('No device backend configured')
189#endif
190
191#ifndef HAVE_DEVICE_MPI
192 if (pe_size .gt. 1) then
193 call mpi_allreduce(mpi_in_place, res, 1, &
194 mpi_real_precision, mpi_sum, neko_comm, ierr)
195 end if
196#endif
197
198 end function device_fusedcg_part2
199
201 subroutine fusedcg_device_init(this, n, max_iter, M, rel_tol, abs_tol, &
202 monitor)
203 class(fusedcg_device_t), target, intent(inout) :: this
204 class(pc_t), optional, intent(in), target :: M
205 integer, intent(in) :: n
206 integer, intent(in) :: max_iter
207 real(kind=rp), optional, intent(in) :: rel_tol
208 real(kind=rp), optional, intent(in) :: abs_tol
209 logical, optional, intent(in) :: monitor
210 type(c_ptr) :: ptr
211 integer(c_size_t) :: p_size
212 integer :: i
213
214 call this%free()
215
216 allocate(this%w(n))
217 allocate(this%r(n))
218 allocate(this%z(n))
219 allocate(this%p(n, device_fusedcg_p_space))
220 allocate(this%p_d(device_fusedcg_p_space))
221 allocate(this%alpha(device_fusedcg_p_space))
222
223 if (present(m)) then
224 this%M => m
225 end if
226
227 call device_map(this%w, this%w_d, n)
228 call device_map(this%r, this%r_d, n)
229 call device_map(this%z, this%z_d, n)
230 call device_map(this%alpha, this%alpha_d, device_fusedcg_p_space)
231 do i = 1, device_fusedcg_p_space+1
232 this%p_d(i) = c_null_ptr
233 call device_map(this%p(:,i), this%p_d(i), n)
234 end do
235
236 p_size = c_sizeof(c_null_ptr) * (device_fusedcg_p_space)
237 call device_alloc(this%p_d_d, p_size)
238 ptr = c_loc(this%p_d)
239 call device_memcpy(ptr, this%p_d_d, p_size, &
240 host_to_device, sync=.false.)
241 if (present(rel_tol) .and. present(abs_tol) .and. present(monitor)) then
242 call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
243 else if (present(rel_tol) .and. present(abs_tol)) then
244 call this%ksp_init(max_iter, rel_tol, abs_tol)
245 else if (present(monitor) .and. present(abs_tol)) then
246 call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
247 else if (present(rel_tol) .and. present(monitor)) then
248 call this%ksp_init(max_iter, rel_tol, monitor = monitor)
249 else if (present(rel_tol)) then
250 call this%ksp_init(max_iter, rel_tol = rel_tol)
251 else if (present(abs_tol)) then
252 call this%ksp_init(max_iter, abs_tol = abs_tol)
253 else if (present(monitor)) then
254 call this%ksp_init(max_iter, monitor = monitor)
255 else
256 call this%ksp_init(max_iter)
257 end if
258
259 call device_event_create(this%gs_event, 2)
260
261 end subroutine fusedcg_device_init
262
264 subroutine fusedcg_device_free(this)
265 class(fusedcg_device_t), intent(inout) :: this
266 integer :: i
267
268 call this%ksp_free()
269
270 if (allocated(this%w)) then
271 deallocate(this%w)
272 end if
273
274 if (allocated(this%r)) then
275 deallocate(this%r)
276 end if
277
278
279 if (allocated(this%z)) then
280 deallocate(this%z)
281 end if
282
283
284 if (allocated(this%alpha)) then
285 deallocate(this%alpha)
286 end if
287
288 if (allocated(this%p)) then
289 deallocate(this%p)
290 end if
291
292 if (c_associated(this%w_d)) then
293 call device_free(this%w_d)
294 end if
295
296 if (c_associated(this%r_d)) then
297 call device_free(this%r_d)
298 end if
299
300 if (c_associated(this%z_d)) then
301 call device_free(this%z_d)
302 end if
303
304 if (c_associated(this%alpha_d)) then
305 call device_free(this%alpha_d)
306 end if
307
308 if (allocated(this%p_d)) then
309 do i = 1, device_fusedcg_p_space
310 if (c_associated(this%p_d(i))) then
311 call device_free(this%p_d(i))
312 end if
313 end do
314 end if
315
316 nullify(this%M)
317
318 if (c_associated(this%gs_event)) then
319 call device_event_destroy(this%gs_event)
320 end if
321
322 end subroutine fusedcg_device_free
323
325 function fusedcg_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter) result(ksp_results)
326 class(fusedcg_device_t), intent(inout) :: this
327 class(ax_t), intent(in) :: ax
328 type(field_t), intent(inout) :: x
329 integer, intent(in) :: n
330 real(kind=rp), dimension(n), intent(in) :: f
331 type(coef_t), intent(inout) :: coef
332 type(bc_list_t), intent(inout) :: blst
333 type(gs_t), intent(inout) :: gs_h
334 type(ksp_monitor_t) :: ksp_results
335 integer, optional, intent(in) :: niter
336 integer :: iter, max_iter, ierr, i, p_cur, p_prev
337 real(kind=rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
338 real(kind=rp) :: pap, beta
339 type(c_ptr) :: f_d
340 f_d = device_get_ptr(f)
341
342 if (present(niter)) then
343 max_iter = niter
344 else
345 max_iter = ksp_max_iter
346 end if
347 norm_fac = 1.0_rp / sqrt(coef%volume)
348
349 associate(w => this%w, r => this%r, p => this%p, z => this%z, &
350 alpha => this%alpha, alpha_d => this%alpha_d, &
351 w_d => this%w_d, r_d => this%r_d, z_d => this%z_d, &
352 p_d => this%p_d, p_d_d => this%p_d_d)
353
354 rtz1 = 1.0_rp
356 p_cur = 1
357 call device_rzero(x%x_d, n)
358 call device_rzero(p_d(1), n)
359 call device_copy(r_d, f_d, n)
360
361 rtr = device_glsc3(r_d, coef%mult_d, r_d, n)
362 rnorm = sqrt(rtr)*norm_fac
363 ksp_results%res_start = rnorm
364 ksp_results%res_final = rnorm
365 ksp_results%iter = 0
366 if(abscmp(rnorm, 0.0_rp)) then
367 ksp_results%converged = .true.
368 return
369 end if
370
371 call this%monitor_start('FusedCG')
372 do iter = 1, max_iter
373 call this%M%solve(z, r, n)
374 rtz2 = rtz1
375 rtz1 = device_glsc3(r_d, coef%mult_d, z_d, n)
376 beta = rtz1 / rtz2
377 if (iter .eq. 1) beta = 0.0_rp
378 call device_fusedcg_update_p(p_d(p_cur), z_d, p_d(p_prev), beta, n)
379
380 call ax%compute(w, p(1, p_cur), coef, x%msh, x%Xh)
381 call gs_h%op(w, n, gs_op_add, this%gs_event)
382 call device_event_sync(this%gs_event)
383 call blst%apply(w, n)
384
385 pap = device_glsc3(w_d, coef%mult_d, this%p_d(p_cur), n)
386
387 alpha(p_cur) = rtz1 / pap
388 rtr = device_fusedcg_part2(r_d, coef%mult_d, w_d, &
389 alpha_d, alpha(p_cur), p_cur, n)
390 rnorm = sqrt(rtr)*norm_fac
391 call this%monitor_iter(iter, rnorm)
392 if ((p_cur .eq. device_fusedcg_p_space) .or. &
393 (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter) then
394 call device_fusedcg_update_x(x%x_d, p_d_d, alpha_d, p_cur, n)
395 p_prev = p_cur
396 p_cur = 1
397 if (rnorm .lt. this%abs_tol) exit
398 else
399 p_prev = p_cur
400 p_cur = p_cur + 1
401 end if
402 end do
403 call this%monitor_stop()
404 ksp_results%res_final = rnorm
405 ksp_results%iter = iter
406 ksp_results%converged = this%is_converged(iter, rnorm)
407
408 end associate
409
410 end function fusedcg_device_solve
411
413 function fusedcg_device_solve_coupled(this, Ax, x, y, z, fx, fy, fz, &
414 n, coef, blstx, blsty, blstz, gs_h, niter) result(ksp_results)
415 class(fusedcg_device_t), intent(inout) :: this
416 class(ax_t), intent(in) :: ax
417 type(field_t), intent(inout) :: x
418 type(field_t), intent(inout) :: y
419 type(field_t), intent(inout) :: z
420 integer, intent(in) :: n
421 real(kind=rp), dimension(n), intent(in) :: fx
422 real(kind=rp), dimension(n), intent(in) :: fy
423 real(kind=rp), dimension(n), intent(in) :: fz
424 type(coef_t), intent(inout) :: coef
425 type(bc_list_t), intent(inout) :: blstx
426 type(bc_list_t), intent(inout) :: blsty
427 type(bc_list_t), intent(inout) :: blstz
428 type(gs_t), intent(inout) :: gs_h
429 type(ksp_monitor_t), dimension(3) :: ksp_results
430 integer, optional, intent(in) :: niter
431
432 ksp_results(1) = this%solve(ax, x, fx, n, coef, blstx, gs_h, niter)
433 ksp_results(2) = this%solve(ax, y, fy, n, coef, blsty, gs_h, niter)
434 ksp_results(3) = this%solve(ax, z, fz, n, coef, blstz, gs_h, niter)
435
437
438end module fusedcg_device
__device__ T solve(const T u, const T y, const T guess, const T nu, const T kappa, const T B)
void hip_fusedcg_update_x(void *x, void *p, void *alpha, int *p_cur, int *n)
real hip_fusedcg_part2(void *a, void *b, void *c, void *alpha_d, real *alpha, int *p_cur, int *n)
void hip_fusedcg_update_p(void *p, void *z, void *po, real *beta, int *n)
Return the device pointer for an associated Fortran array.
Definition device.F90:96
Map a Fortran array to a device (allocate and associate)
Definition device.F90:72
Copy data between host and device (or device and device)
Definition device.F90:66
Defines a Matrix-vector product.
Definition ax.f90:34
Defines a list of bc_t.
Definition bc_list.f90:34
Coefficients.
Definition coef.f90:34
Definition comm.F90:1
type(mpi_datatype), public mpi_real_precision
MPI type for working precision of REAL types.
Definition comm.F90:50
integer, public pe_size
MPI size of communicator.
Definition comm.F90:58
type(mpi_comm), public neko_comm
MPI communicator.
Definition comm.F90:42
subroutine, public device_rzero(a_d, n, strm)
Zero a real vector.
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
Device abstraction, common interface for various accelerators.
Definition device.F90:34
subroutine, public device_event_sync(event)
Synchronize an event.
Definition device.F90:1309
integer, parameter, public host_to_device
Definition device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:214
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition device.F90:1274
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
Definition device.F90:187
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition device.F90:1244
Defines a field.
Definition field.f90:34
Defines a fused Conjugate Gradient method for accelerators.
subroutine fusedcg_device_free(this)
Deallocate a pipelined PCG solver.
real(kind=rp) function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
type(ksp_monitor_t) function, dimension(3) fusedcg_device_solve_coupled(this, ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Pipelined PCG solve coupled solve.
integer, parameter device_fusedcg_p_space
subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
subroutine fusedcg_device_init(this, n, max_iter, m, rel_tol, abs_tol, monitor)
Initialise a fused PCG solver.
type(ksp_monitor_t) function fusedcg_device_solve(this, ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
Gather-scatter.
Implements the base abstract type for Krylov solvers plus helper types.
Definition krylov.f90:34
integer, parameter, public ksp_max_iter
Maximum number of iters.
Definition krylov.f90:51
Definition math.f90:60
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
Definition math.f90:1026
subroutine, public copy(a, b, n)
Copy a vector .
Definition math.f90:255
subroutine, public rzero(a, n)
Zero a real vector.
Definition math.f90:211
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Krylov preconditioner.
Definition precon.f90:34
Utilities.
Definition utils.f90:35
Base type for a matrix-vector product providing .
Definition ax.f90:43
A list of allocatable `bc_t`. Follows the standard interface of lists.
Definition bc_list.f90:48
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Definition coef.f90:55
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Definition krylov.f90:56
Base abstract type for a canonical Krylov method, solving .
Definition krylov.f90:73
Defines a canonical Krylov preconditioner.
Definition precon.f90:40