48 use mpi_f08,
only : mpi_in_place, mpi_allreduce, &
50 use,
intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr, &
51 c_associated, c_size_t, c_sizeof, c_int, c_loc
59 real(kind=
rp),
allocatable :: w1(:)
60 real(kind=
rp),
allocatable :: w2(:)
61 real(kind=
rp),
allocatable :: w3(:)
62 real(kind=
rp),
allocatable :: r1(:)
63 real(kind=
rp),
allocatable :: r2(:)
64 real(kind=
rp),
allocatable :: r3(:)
65 real(kind=
rp),
allocatable :: z1(:)
66 real(kind=
rp),
allocatable :: z2(:)
67 real(kind=
rp),
allocatable :: z3(:)
68 real(kind=
rp),
allocatable :: tmp(:)
69 real(kind=
rp),
allocatable :: p1(:,:)
70 real(kind=
rp),
allocatable :: p2(:,:)
71 real(kind=
rp),
allocatable :: p3(:,:)
72 real(kind=
rp),
allocatable :: alpha(:)
73 type(c_ptr) :: w1_d = c_null_ptr
74 type(c_ptr) :: w2_d = c_null_ptr
75 type(c_ptr) :: w3_d = c_null_ptr
76 type(c_ptr) :: r1_d = c_null_ptr
77 type(c_ptr) :: r2_d = c_null_ptr
78 type(c_ptr) :: r3_d = c_null_ptr
79 type(c_ptr) :: z1_d = c_null_ptr
80 type(c_ptr) :: z2_d = c_null_ptr
81 type(c_ptr) :: z3_d = c_null_ptr
82 type(c_ptr) :: alpha_d = c_null_ptr
83 type(c_ptr) :: p1_d_d = c_null_ptr
84 type(c_ptr) :: p2_d_d = c_null_ptr
85 type(c_ptr) :: p3_d_d = c_null_ptr
86 type(c_ptr) :: tmp_d = c_null_ptr
87 type(c_ptr),
allocatable :: p1_d(:)
88 type(c_ptr),
allocatable :: p2_d(:)
89 type(c_ptr),
allocatable :: p3_d(:)
90 type(c_ptr) :: gs_event1 = c_null_ptr
91 type(c_ptr) :: gs_event2 = c_null_ptr
92 type(c_ptr) :: gs_event3 = c_null_ptr
103 b1_d, b2_d, b3_d, tmp_d, n) bind(c, name='cuda_fusedcg_cpld_part1')
104 use,
intrinsic :: iso_c_binding
107 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d
114 po1_d, po2_d, po3_d, beta, n) bind(c, name='cuda_fusedcg_cpld_update_p')
115 use,
intrinsic :: iso_c_binding
118 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
119 type(c_ptr),
value :: po1_d, po2_d, po3_d
127 alpha, p_cur, n) bind(c, name='cuda_fusedcg_cpld_update_x')
128 use,
intrinsic :: iso_c_binding
130 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
131 integer(c_int) :: p_cur, n
137 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n) &
138 bind(c, name=
'cuda_fusedcg_cpld_part2')
139 use,
intrinsic :: iso_c_binding
142 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
143 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
145 integer(c_int) :: n, p_cur
151 b1_d, b2_d, b3_d, tmp_d, n) bind(c, name='hip_fusedcg_cpld_part1')
152 use,
intrinsic :: iso_c_binding
155 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d
162 po1_d, po2_d, po3_d, beta, n) bind(c, name='hip_fusedcg_cpld_update_p')
163 use,
intrinsic :: iso_c_binding
166 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
167 type(c_ptr),
value :: po1_d, po2_d, po3_d
175 alpha, p_cur, n) bind(c, name='hip_fusedcg_cpld_update_x')
176 use,
intrinsic :: iso_c_binding
178 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
179 integer(c_int) :: p_cur, n
185 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n) &
186 bind(c, name=
'hip_fusedcg_cpld_part2')
187 use,
intrinsic :: iso_c_binding
190 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
191 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
193 integer(c_int) :: n, p_cur
201 b1_d, b2_d, b3_d, tmp_d, n)
202 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d
203 type(c_ptr),
value :: tmp_d
210 call neko_error(
'No device backend configured')
215 po1_d, po2_d, po3_d, beta, n)
216 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
217 type(c_ptr),
value :: po1_d, po2_d, po3_d
222 po1_d, po2_d, po3_d, beta, n)
225 po1_d, po2_d, po3_d, beta, n)
227 call neko_error(
'No device backend configured')
232 p1_d, p2_d, p3_d, alpha, p_cur, n)
233 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
234 integer(c_int) :: p_cur, n
237 p1_d, p2_d, p3_d, alpha, p_cur, n)
240 p1_d, p2_d, p3_d, alpha, p_cur, n)
242 call neko_error(
'No device backend configured')
247 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
result(res)
248 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
249 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
256 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
259 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
261 call neko_error(
'No device backend configured')
264#ifndef HAVE_DEVICE_MPI
266 call mpi_allreduce(mpi_in_place, res, 1, &
275 rel_tol, abs_tol, monitor)
277 class(
pc_t),
optional,
intent(in),
target :: M
278 integer,
intent(in) :: n
279 integer,
intent(in) :: max_iter
280 real(kind=
rp),
optional,
intent(in) :: rel_tol
281 real(kind=
rp),
optional,
intent(in) :: abs_tol
282 logical,
optional,
intent(in) :: monitor
284 integer(c_size_t) :: p_size
298 allocate(this%tmp(n))
323 this%p1_d(i) = c_null_ptr
324 call device_map(this%p1(:,i), this%p1_d(i), n)
326 this%p2_d(i) = c_null_ptr
327 call device_map(this%p2(:,i), this%p2_d(i), n)
329 this%p3_d(i) = c_null_ptr
330 call device_map(this%p3(:,i), this%p3_d(i), n)
337 ptr = c_loc(this%p1_d)
340 ptr = c_loc(this%p2_d)
343 ptr = c_loc(this%p3_d)
346 if (
present(rel_tol) .and.
present(abs_tol) .and.
present(monitor))
then
347 call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
348 else if (
present(rel_tol) .and.
present(abs_tol))
then
349 call this%ksp_init(max_iter, rel_tol, abs_tol)
350 else if (
present(monitor) .and.
present(abs_tol))
then
351 call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
352 else if (
present(rel_tol) .and.
present(monitor))
then
353 call this%ksp_init(max_iter, rel_tol, monitor = monitor)
354 else if (
present(rel_tol))
then
355 call this%ksp_init(max_iter, rel_tol = rel_tol)
356 else if (
present(abs_tol))
then
357 call this%ksp_init(max_iter, abs_tol = abs_tol)
358 else if (
present(monitor))
then
359 call this%ksp_init(max_iter, monitor = monitor)
361 call this%ksp_init(max_iter)
377 if (
allocated(this%w1))
then
381 if (
allocated(this%w2))
then
385 if (
allocated(this%w3))
then
389 if (
allocated(this%r1))
then
393 if (
allocated(this%r2))
then
397 if (
allocated(this%r3))
then
401 if (
allocated(this%z1))
then
405 if (
allocated(this%z2))
then
409 if (
allocated(this%z3))
then
413 if (
allocated(this%tmp))
then
417 if (
allocated(this%alpha))
then
418 deallocate(this%alpha)
421 if (
allocated(this%p1))
then
425 if (
allocated(this%p2))
then
429 if (
allocated(this%p3))
then
433 if (c_associated(this%w1_d))
then
437 if (c_associated(this%w2_d))
then
441 if (c_associated(this%w3_d))
then
445 if (c_associated(this%r1_d))
then
449 if (c_associated(this%r2_d))
then
453 if (c_associated(this%r3_d))
then
457 if (c_associated(this%z1_d))
then
461 if (c_associated(this%z2_d))
then
465 if (c_associated(this%z3_d))
then
469 if (c_associated(this%alpha_d))
then
473 if (c_associated(this%tmp_d))
then
477 if (
allocated(this%p1_d))
then
479 if (c_associated(this%p1_d(i)))
then
485 if (
allocated(this%p2_d))
then
487 if (c_associated(this%p2_d(i)))
then
493 if (
allocated(this%p3_d))
then
495 if (c_associated(this%p3_d(i)))
then
503 if (c_associated(this%gs_event1))
then
507 if (c_associated(this%gs_event2))
then
511 if (c_associated(this%gs_event3))
then
519 n, coef, blstx, blsty, blstz, gs_h, niter)
result(ksp_results)
521 class(
ax_t),
intent(in) :: ax
522 type(
field_t),
intent(inout) :: x
523 type(
field_t),
intent(inout) :: y
524 type(
field_t),
intent(inout) :: z
525 integer,
intent(in) :: n
526 real(kind=
rp),
dimension(n),
intent(in) :: fx
527 real(kind=
rp),
dimension(n),
intent(in) :: fy
528 real(kind=
rp),
dimension(n),
intent(in) :: fz
529 type(
coef_t),
intent(inout) :: coef
533 type(
gs_t),
intent(inout) :: gs_h
535 integer,
optional,
intent(in) :: niter
536 integer :: iter, max_iter, ierr, i, p_cur, p_prev
537 real(kind=
rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
538 real(kind=
rp) :: pap, beta
547 if (
present(niter))
then
552 norm_fac = 1.0_rp / sqrt(coef%volume)
554 associate(w1 => this%w1, w2 => this%w2, w3 => this%w3, r1 => this%r1, &
555 r2 => this%r2, r3 => this%r3, p1 => this%p1, p2 => this%p2, &
556 p3 => this%p3, z1 => this%z1, z2 => this%z2, z3 => this%z3, &
557 tmp_d => this%tmp_d, alpha => this%alpha, alpha_d => this%alpha_d, &
558 w1_d => this%w1_d, w2_d => this%w2_d, w3_d => this%w3_d, &
559 r1_d => this%r1_d, r2_d => this%r2_d, r3_d => this%r3_d, &
560 z1_d => this%z1_d, z2_d => this%z2_d, z3_d => this%z3_d, &
561 p1_d => this%p1_d, p2_d => this%p2_d, p3_d => this%p3_d, &
562 p1_d_d => this%p1_d_d, p2_d_d => this%p2_d_d, p3_d_d => this%p3_d_d)
580 r2_d, r3_d, tmp_d, n)
584 rnorm = sqrt(rtr)*norm_fac
585 ksp_results%res_start = rnorm
586 ksp_results%res_final = rnorm
587 ksp_results(1)%iter = 0
588 ksp_results(2:3)%iter = -1
589 if(
abscmp(rnorm, 0.0_rp))
then
590 ksp_results%converged = .true.
593 call this%monitor_start(
'fcpldCG')
594 do iter = 1, max_iter
595 call this%M%solve(z1, r1, n)
596 call this%M%solve(z2, r2, n)
597 call this%M%solve(z3, r3, n)
600 r1_d, r2_d, r3_d, tmp_d, n)
604 if (iter .eq. 1) beta = 0.0_rp
607 z1_d, z2_d, z3_d, p1_d(p_prev), p2_d(p_prev), p3_d(p_prev), beta, n)
609 call ax%compute_vector(w1, w2, w3, &
610 p1(1, p_cur), p2(1, p_cur), p3(1, p_cur), coef, x%msh, x%Xh)
611 call gs_h%op(w1, n, gs_op_add, this%gs_event1)
613 call blstx%apply(w1, n)
614 call gs_h%op(w2, n, gs_op_add, this%gs_event2)
616 call blsty%apply(w2, n)
617 call gs_h%op(w3, n, gs_op_add, this%gs_event3)
619 call blstz%apply(w3, n)
622 p2_d(p_cur), p3_d(p_cur), tmp_d, n)
626 alpha(p_cur) = rtz1 / pap
628 w1_d, w2_d, w3_d, alpha_d, alpha(p_cur), p_cur, n)
629 rnorm = sqrt(rtr)*norm_fac
630 call this%monitor_iter(iter, rnorm)
632 (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter)
then
634 p1_d_d, p2_d_d, p3_d_d, alpha_d, p_cur, n)
637 if (rnorm .lt. this%abs_tol)
exit
643 call this%monitor_stop()
644 ksp_results%res_final = rnorm
645 ksp_results%iter = iter
646 ksp_results%converged = this%is_converged(iter, rnorm)
654 gs_h, niter)
result(ksp_results)
656 class(
ax_t),
intent(in) :: ax
657 type(
field_t),
intent(inout) :: x
658 integer,
intent(in) :: n
659 real(kind=
rp),
dimension(n),
intent(in) :: f
660 type(
coef_t),
intent(inout) :: coef
662 type(
gs_t),
intent(inout) :: gs_h
664 integer,
optional,
intent(in) :: niter
667 call neko_error(
'The cpldcg solver is only defined for coupled solves')
669 ksp_results%res_final = 0.0
671 ksp_results%converged = .false.
__device__ T solve(const T u, const T y, const T guess, const T nu, const T kappa, const T B)
void hip_fusedcg_cpld_update_x(void *x1, void *x2, void *x3, void *p1, void *p2, void *p3, void *alpha, int *p_cur, int *n)
void hip_fusedcg_cpld_update_p(void *p1, void *p2, void *p3, void *z1, void *z2, void *z3, void *po1, void *po2, void *po3, real *beta, int *n)
real hip_fusedcg_cpld_part2(void *a1, void *a2, void *a3, void *b, void *c1, void *c2, void *c3, void *alpha_d, real *alpha, int *p_cur, int *n)
void hip_fusedcg_cpld_part1(void *a1, void *a2, void *a3, void *b1, void *b2, void *b3, void *tmp, int *n)
Return the device pointer for an associated Fortran array.
Map a Fortran array to a device (allocate and associate)
Copy data between host and device (or device and device)
Defines a Matrix-vector product.
type(mpi_datatype), public mpi_real_precision
MPI type for working precision of REAL types.
integer, public pe_size
MPI size of communicator.
type(mpi_comm), public neko_comm
MPI communicator.
subroutine, public device_rzero(a_d, n, strm)
Zero a real vector.
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
real(kind=rp) function, public device_glsc2(a_d, b_d, n, strm)
Weighted inner product .
Device abstraction, common interface for various accelerators.
subroutine, public device_event_sync(event)
Synchronize an event.
integer, parameter, public host_to_device
subroutine, public device_free(x_d)
Deallocate memory on the device.
subroutine, public device_event_destroy(event)
Destroy a device event.
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
subroutine, public device_event_create(event, flags)
Create a device event queue.
Defines a fused Conjugate Gradient method for accelerators.
subroutine device_fusedcg_cpld_update_x(x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha, p_cur, n)
type(ksp_monitor_t) function fusedcg_cpld_device_solve(this, ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
subroutine fusedcg_cpld_device_free(this)
Deallocate a pipelined PCG solver.
subroutine device_fusedcg_cpld_update_p(p1_d, p2_d, p3_d, z1_d, z2_d, z3_d, po1_d, po2_d, po3_d, beta, n)
subroutine fusedcg_cpld_device_init(this, n, max_iter, m, rel_tol, abs_tol, monitor)
Initialise a fused PCG solver.
real(kind=rp) function device_fusedcg_cpld_part2(a1_d, a2_d, a3_d, b_d, c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
type(ksp_monitor_t) function, dimension(3) fusedcg_cpld_device_solve_coupled(this, ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Pipelined PCG solve coupled solve.
integer, parameter device_fusedcg_cpld_p_space
subroutine device_fusedcg_cpld_part1(a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d, n)
Implements the base abstract type for Krylov solvers plus helper types.
integer, parameter, public ksp_max_iter
Maximum number of iters.
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
subroutine, public copy(a, b, n)
Copy a vector .
subroutine, public rzero(a, n)
Zero a real vector.
integer, parameter, public c_rp
integer, parameter, public rp
Global precision used in computations.
Base type for a matrix-vector product providing .
A list of allocatable `bc_t`. Follows the standard interface of lists.
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Base abstract type for a canonical Krylov method, solving .
Defines a canonical Krylov preconditioner.