50 use mpi_f08,
only : mpi_allreduce, mpi_in_place, mpi_sum
51 use,
intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr, &
52 c_associated, c_size_t, c_sizeof, c_int, c_loc
60 real(kind=
rp),
allocatable :: w(:)
61 real(kind=
rp),
allocatable :: r(:)
62 real(kind=
rp),
allocatable :: z(:)
63 real(kind=
rp),
allocatable :: p(:,:)
64 real(kind=
rp),
allocatable :: alpha(:)
65 type(c_ptr) :: w_d = c_null_ptr
66 type(c_ptr) :: r_d = c_null_ptr
67 type(c_ptr) :: z_d = c_null_ptr
68 type(c_ptr) :: alpha_d = c_null_ptr
69 type(c_ptr) :: p_d_d = c_null_ptr
70 type(c_ptr),
allocatable :: p_d(:)
71 type(c_ptr) :: gs_event = c_null_ptr
82 bind(c, name=
'cuda_fusedcg_update_p')
83 use,
intrinsic :: iso_c_binding
86 type(c_ptr),
value :: p_d, z_d, po_d
94 bind(c, name=
'cuda_fusedcg_update_x')
95 use,
intrinsic :: iso_c_binding
97 type(c_ptr),
value :: x_d, p_d, alpha
98 integer(c_int) :: p_cur, n
104 p_cur, n) bind(c, name=
'cuda_fusedcg_part2')
105 use,
intrinsic :: iso_c_binding
108 type(c_ptr),
value :: a_d, b_d, c_d, alpha_d
110 integer(c_int) :: n, p_cur
116 bind(c, name=
'hip_fusedcg_update_p')
117 use,
intrinsic :: iso_c_binding
120 type(c_ptr),
value :: p_d, z_d, po_d
128 bind(c, name=
'hip_fusedcg_update_x')
129 use,
intrinsic :: iso_c_binding
131 type(c_ptr),
value :: x_d, p_d, alpha
132 integer(c_int) :: p_cur, n
138 p_cur, n) bind(c, name=
'hip_fusedcg_part2')
139 use,
intrinsic :: iso_c_binding
142 type(c_ptr),
value :: a_d, b_d, c_d, alpha_d
144 integer(c_int) :: n, p_cur
152 type(c_ptr),
value :: p_d, z_d, po_d
160 call neko_error(
'No device backend configured')
165 type(c_ptr),
value :: x_d, p_d, alpha
166 integer(c_int) :: p_cur, n
172 call neko_error(
'No device backend configured')
177 p_cur, n)
result(res)
178 type(c_ptr),
value :: a_d, b_d, c_d, alpha_d
188 call neko_error(
'No device backend configured')
191#ifndef HAVE_DEVICE_MPI
192 if (pe_size .gt. 1)
then
193 call mpi_allreduce(mpi_in_place, res, 1, &
194 mpi_real_precision, mpi_sum, neko_comm, ierr)
204 class(pc_t),
optional,
intent(in),
target :: M
205 integer,
intent(in) :: n
206 integer,
intent(in) :: max_iter
207 real(kind=rp),
optional,
intent(in) :: rel_tol
208 real(kind=rp),
optional,
intent(in) :: abs_tol
209 logical,
optional,
intent(in) :: monitor
211 integer(c_size_t) :: p_size
227 call device_map(this%w, this%w_d, n)
228 call device_map(this%r, this%r_d, n)
229 call device_map(this%z, this%z_d, n)
232 this%p_d(i) = c_null_ptr
233 call device_map(this%p(:,i), this%p_d(i), n)
237 call device_alloc(this%p_d_d, p_size)
238 ptr = c_loc(this%p_d)
239 call device_memcpy(ptr, this%p_d_d, p_size, &
240 host_to_device, sync=.false.)
241 if (
present(rel_tol) .and.
present(abs_tol) .and.
present(monitor))
then
242 call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
243 else if (
present(rel_tol) .and.
present(abs_tol))
then
244 call this%ksp_init(max_iter, rel_tol, abs_tol)
245 else if (
present(monitor) .and.
present(abs_tol))
then
246 call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
247 else if (
present(rel_tol) .and.
present(monitor))
then
248 call this%ksp_init(max_iter, rel_tol, monitor = monitor)
249 else if (
present(rel_tol))
then
250 call this%ksp_init(max_iter, rel_tol = rel_tol)
251 else if (
present(abs_tol))
then
252 call this%ksp_init(max_iter, abs_tol = abs_tol)
253 else if (
present(monitor))
then
254 call this%ksp_init(max_iter, monitor = monitor)
256 call this%ksp_init(max_iter)
259 call device_event_create(this%gs_event, 2)
270 if (
allocated(this%w))
then
274 if (
allocated(this%r))
then
279 if (
allocated(this%z))
then
284 if (
allocated(this%alpha))
then
285 deallocate(this%alpha)
288 if (
allocated(this%p))
then
292 if (c_associated(this%w_d))
then
293 call device_free(this%w_d)
296 if (c_associated(this%r_d))
then
297 call device_free(this%r_d)
300 if (c_associated(this%z_d))
then
301 call device_free(this%z_d)
304 if (c_associated(this%alpha_d))
then
305 call device_free(this%alpha_d)
308 if (
allocated(this%p_d))
then
310 if (c_associated(this%p_d(i)))
then
311 call device_free(this%p_d(i))
318 if (c_associated(this%gs_event))
then
319 call device_event_destroy(this%gs_event)
327 class(ax_t),
intent(in) :: ax
328 type(field_t),
intent(inout) :: x
329 integer,
intent(in) :: n
330 real(kind=rp),
dimension(n),
intent(in) :: f
331 type(coef_t),
intent(inout) :: coef
332 type(bc_list_t),
intent(inout) :: blst
333 type(gs_t),
intent(inout) :: gs_h
334 type(ksp_monitor_t) :: ksp_results
335 integer,
optional,
intent(in) :: niter
336 integer :: iter, max_iter, ierr, i, p_cur, p_prev
337 real(kind=rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
338 real(kind=rp) :: pap, beta
340 f_d = device_get_ptr(f)
342 if (
present(niter))
then
345 max_iter = ksp_max_iter
347 norm_fac = 1.0_rp / sqrt(coef%volume)
349 associate(w => this%w, r => this%r, p => this%p, z => this%z, &
350 alpha => this%alpha, alpha_d => this%alpha_d, &
351 w_d => this%w_d, r_d => this%r_d, z_d => this%z_d, &
352 p_d => this%p_d, p_d_d => this%p_d_d)
357 call device_rzero(x%x_d, n)
358 call device_rzero(p_d(1), n)
359 call device_copy(r_d, f_d, n)
361 rtr = device_glsc3(r_d, coef%mult_d, r_d, n)
362 rnorm = sqrt(rtr)*norm_fac
363 ksp_results%res_start = rnorm
364 ksp_results%res_final = rnorm
366 if(abscmp(rnorm, 0.0_rp))
then
367 ksp_results%converged = .true.
371 call this%monitor_start(
'FusedCG')
372 do iter = 1, max_iter
373 call this%M%solve(z, r, n)
375 rtz1 = device_glsc3(r_d, coef%mult_d, z_d, n)
377 if (iter .eq. 1) beta = 0.0_rp
380 call ax%compute(w, p(1, p_cur), coef, x%msh, x%Xh)
381 call gs_h%op(w, n, gs_op_add, this%gs_event)
382 call device_event_sync(this%gs_event)
383 call blst%apply(w, n)
385 pap = device_glsc3(w_d, coef%mult_d, this%p_d(p_cur), n)
387 alpha(p_cur) = rtz1 / pap
389 alpha_d, alpha(p_cur), p_cur, n)
390 rnorm = sqrt(rtr)*norm_fac
391 call this%monitor_iter(iter, rnorm)
393 (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter)
then
397 if (rnorm .lt. this%abs_tol)
exit
403 call this%monitor_stop()
404 ksp_results%res_final = rnorm
405 ksp_results%iter = iter
406 ksp_results%converged = this%is_converged(iter, rnorm)
414 n, coef, blstx, blsty, blstz, gs_h, niter)
result(ksp_results)
416 class(ax_t),
intent(in) :: ax
417 type(field_t),
intent(inout) :: x
418 type(field_t),
intent(inout) :: y
419 type(field_t),
intent(inout) :: z
420 integer,
intent(in) :: n
421 real(kind=rp),
dimension(n),
intent(in) :: fx
422 real(kind=rp),
dimension(n),
intent(in) :: fy
423 real(kind=rp),
dimension(n),
intent(in) :: fz
424 type(coef_t),
intent(inout) :: coef
425 type(bc_list_t),
intent(inout) :: blstx
426 type(bc_list_t),
intent(inout) :: blsty
427 type(bc_list_t),
intent(inout) :: blstz
428 type(gs_t),
intent(inout) :: gs_h
429 type(ksp_monitor_t),
dimension(3) :: ksp_results
430 integer,
optional,
intent(in) :: niter
432 ksp_results(1) = this%solve(ax, x, fx, n, coef, blstx, gs_h, niter)
433 ksp_results(2) = this%solve(ax, y, fy, n, coef, blsty, gs_h, niter)
434 ksp_results(3) = this%solve(ax, z, fz, n, coef, blstz, gs_h, niter)
__device__ T solve(const T u, const T y, const T guess, const T nu, const T kappa, const T B)
void hip_fusedcg_update_x(void *x, void *p, void *alpha, int *p_cur, int *n)
real hip_fusedcg_part2(void *a, void *b, void *c, void *alpha_d, real *alpha, int *p_cur, int *n)
void hip_fusedcg_update_p(void *p, void *z, void *po, real *beta, int *n)
Return the device pointer for an associated Fortran array.
Map a Fortran array to a device (allocate and associate)
Copy data between host and device (or device and device)
Defines a Matrix-vector product.
type(mpi_datatype), public mpi_real_precision
MPI type for working precision of REAL types.
integer, public pe_size
MPI size of communicator.
type(mpi_comm), public neko_comm
MPI communicator.
subroutine, public device_rzero(a_d, n, strm)
Zero a real vector.
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
Device abstraction, common interface for various accelerators.
subroutine, public device_event_sync(event)
Synchronize an event.
integer, parameter, public host_to_device
subroutine, public device_free(x_d)
Deallocate memory on the device.
subroutine, public device_event_destroy(event)
Destroy a device event.
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
subroutine, public device_event_create(event, flags)
Create a device event queue.
Defines a fused Conjugate Gradient method for accelerators.
subroutine fusedcg_device_free(this)
Deallocate a pipelined PCG solver.
real(kind=rp) function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
type(ksp_monitor_t) function, dimension(3) fusedcg_device_solve_coupled(this, ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Pipelined PCG solve coupled solve.
integer, parameter device_fusedcg_p_space
subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
subroutine fusedcg_device_init(this, n, max_iter, m, rel_tol, abs_tol, monitor)
Initialise a fused PCG solver.
type(ksp_monitor_t) function fusedcg_device_solve(this, ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
Implements the base abstract type for Krylov solvers plus helper types.
integer, parameter, public ksp_max_iter
Maximum number of iters.
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
subroutine, public copy(a, b, n)
Copy a vector .
subroutine, public rzero(a, n)
Zero a real vector.
integer, parameter, public c_rp
integer, parameter, public rp
Global precision used in computations.
Base type for a matrix-vector product providing .
A list of allocatable `bc_t`. Follows the standard interface of lists.
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Base abstract type for a canonical Krylov method, solving .
Defines a canonical Krylov preconditioner.