54 real(kind=
rp),
allocatable :: w1(:)
55 real(kind=
rp),
allocatable :: w2(:)
56 real(kind=
rp),
allocatable :: w3(:)
57 real(kind=
rp),
allocatable :: r1(:)
58 real(kind=
rp),
allocatable :: r2(:)
59 real(kind=
rp),
allocatable :: r3(:)
60 real(kind=
rp),
allocatable :: z1(:)
61 real(kind=
rp),
allocatable :: z2(:)
62 real(kind=
rp),
allocatable :: z3(:)
63 real(kind=
rp),
allocatable :: tmp(:)
64 real(kind=
rp),
allocatable :: p1(:,:)
65 real(kind=
rp),
allocatable :: p2(:,:)
66 real(kind=
rp),
allocatable :: p3(:,:)
67 real(kind=
rp),
allocatable :: alpha(:)
68 type(c_ptr) :: w1_d = c_null_ptr
69 type(c_ptr) :: w2_d = c_null_ptr
70 type(c_ptr) :: w3_d = c_null_ptr
71 type(c_ptr) :: r1_d = c_null_ptr
72 type(c_ptr) :: r2_d = c_null_ptr
73 type(c_ptr) :: r3_d = c_null_ptr
74 type(c_ptr) :: z1_d = c_null_ptr
75 type(c_ptr) :: z2_d = c_null_ptr
76 type(c_ptr) :: z3_d = c_null_ptr
77 type(c_ptr) :: alpha_d = c_null_ptr
78 type(c_ptr) :: p1_d_d = c_null_ptr
79 type(c_ptr) :: p2_d_d = c_null_ptr
80 type(c_ptr) :: p3_d_d = c_null_ptr
81 type(c_ptr) :: tmp_d = c_null_ptr
82 type(c_ptr),
allocatable :: p1_d(:)
83 type(c_ptr),
allocatable :: p2_d(:)
84 type(c_ptr),
allocatable :: p3_d(:)
85 type(c_ptr) :: gs_event1 = c_null_ptr
86 type(c_ptr) :: gs_event2 = c_null_ptr
87 type(c_ptr) :: gs_event3 = c_null_ptr
98 b1_d, b2_d, b3_d, tmp_d, n) bind(c, name='cuda_fusedcg_cpld_part1')
99 use,
intrinsic :: iso_c_binding
102 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d
109 po1_d, po2_d, po3_d, beta, n) bind(c, name='cuda_fusedcg_cpld_update_p')
110 use,
intrinsic :: iso_c_binding
113 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
114 type(c_ptr),
value :: po1_d, po2_d, po3_d
122 alpha, p_cur, n) bind(c, name='cuda_fusedcg_cpld_update_x')
123 use,
intrinsic :: iso_c_binding
125 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
126 integer(c_int) :: p_cur, n
132 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n) &
133 bind(c, name=
'cuda_fusedcg_cpld_part2')
134 use,
intrinsic :: iso_c_binding
137 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
138 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
140 integer(c_int) :: n, p_cur
146 b1_d, b2_d, b3_d, tmp_d, n) bind(c, name='hip_fusedcg_cpld_part1')
147 use,
intrinsic :: iso_c_binding
150 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d
157 po1_d, po2_d, po3_d, beta, n) bind(c, name='hip_fusedcg_cpld_update_p')
158 use,
intrinsic :: iso_c_binding
161 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
162 type(c_ptr),
value :: po1_d, po2_d, po3_d
170 alpha, p_cur, n) bind(c, name='hip_fusedcg_cpld_update_x')
171 use,
intrinsic :: iso_c_binding
173 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
174 integer(c_int) :: p_cur, n
180 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n) &
181 bind(c, name=
'hip_fusedcg_cpld_part2')
182 use,
intrinsic :: iso_c_binding
185 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
186 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
188 integer(c_int) :: n, p_cur
196 b1_d, b2_d, b3_d, tmp_d, n)
197 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d
198 type(c_ptr),
value :: tmp_d
205 call neko_error(
'No device backend configured')
210 po1_d, po2_d, po3_d, beta, n)
211 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
212 type(c_ptr),
value :: po1_d, po2_d, po3_d
217 po1_d, po2_d, po3_d, beta, n)
220 po1_d, po2_d, po3_d, beta, n)
222 call neko_error(
'No device backend configured')
227 p1_d, p2_d, p3_d, alpha, p_cur, n)
228 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
229 integer(c_int) :: p_cur, n
232 p1_d, p2_d, p3_d, alpha, p_cur, n)
235 p1_d, p2_d, p3_d, alpha, p_cur, n)
237 call neko_error(
'No device backend configured')
242 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
result(res)
243 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
244 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
251 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
254 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
256 call neko_error(
'No device backend configured')
259#ifndef HAVE_DEVICE_MPI
261 call mpi_allreduce(mpi_in_place, res, 1, &
270 rel_tol, abs_tol, monitor)
272 class(
pc_t),
optional,
intent(in),
target :: M
273 integer,
intent(in) :: n
274 integer,
intent(in) :: max_iter
275 real(kind=
rp),
optional,
intent(in) :: rel_tol
276 real(kind=
rp),
optional,
intent(in) :: abs_tol
277 logical,
optional,
intent(in) :: monitor
279 integer(c_size_t) :: p_size
293 allocate(this%tmp(n))
318 this%p1_d(i) = c_null_ptr
319 call device_map(this%p1(:,i), this%p1_d(i), n)
321 this%p2_d(i) = c_null_ptr
322 call device_map(this%p2(:,i), this%p2_d(i), n)
324 this%p3_d(i) = c_null_ptr
325 call device_map(this%p3(:,i), this%p3_d(i), n)
332 ptr = c_loc(this%p1_d)
335 ptr = c_loc(this%p2_d)
338 ptr = c_loc(this%p3_d)
341 if (
present(rel_tol) .and.
present(abs_tol) .and.
present(monitor))
then
342 call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
343 else if (
present(rel_tol) .and.
present(abs_tol))
then
344 call this%ksp_init(max_iter, rel_tol, abs_tol)
345 else if (
present(monitor) .and.
present(abs_tol))
then
346 call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
347 else if (
present(rel_tol) .and.
present(monitor))
then
348 call this%ksp_init(max_iter, rel_tol, monitor = monitor)
349 else if (
present(rel_tol))
then
350 call this%ksp_init(max_iter, rel_tol = rel_tol)
351 else if (
present(abs_tol))
then
352 call this%ksp_init(max_iter, abs_tol = abs_tol)
353 else if (
present(monitor))
then
354 call this%ksp_init(max_iter, monitor = monitor)
356 call this%ksp_init(max_iter)
372 if (
allocated(this%w1))
then
376 if (
allocated(this%w2))
then
380 if (
allocated(this%w3))
then
384 if (
allocated(this%r1))
then
388 if (
allocated(this%r2))
then
392 if (
allocated(this%r3))
then
396 if (
allocated(this%z1))
then
400 if (
allocated(this%z2))
then
404 if (
allocated(this%z3))
then
408 if (
allocated(this%tmp))
then
412 if (
allocated(this%alpha))
then
413 deallocate(this%alpha)
416 if (
allocated(this%p1))
then
420 if (
allocated(this%p2))
then
424 if (
allocated(this%p3))
then
428 if (c_associated(this%w1_d))
then
432 if (c_associated(this%w2_d))
then
436 if (c_associated(this%w3_d))
then
440 if (c_associated(this%r1_d))
then
444 if (c_associated(this%r2_d))
then
448 if (c_associated(this%r3_d))
then
452 if (c_associated(this%z1_d))
then
456 if (c_associated(this%z2_d))
then
460 if (c_associated(this%z3_d))
then
464 if (c_associated(this%alpha_d))
then
468 if (c_associated(this%tmp_d))
then
472 if (
allocated(this%p1_d))
then
474 if (c_associated(this%p1_d(i)))
then
480 if (
allocated(this%p2_d))
then
482 if (c_associated(this%p2_d(i)))
then
488 if (
allocated(this%p3_d))
then
490 if (c_associated(this%p3_d(i)))
then
498 if (c_associated(this%gs_event1))
then
502 if (c_associated(this%gs_event2))
then
506 if (c_associated(this%gs_event3))
then
514 n, coef, blstx, blsty, blstz, gs_h, niter)
result(ksp_results)
516 class(
ax_t),
intent(in) :: ax
517 type(
field_t),
intent(inout) :: x
518 type(
field_t),
intent(inout) :: y
519 type(
field_t),
intent(inout) :: z
520 integer,
intent(in) :: n
521 real(kind=
rp),
dimension(n),
intent(in) :: fx
522 real(kind=
rp),
dimension(n),
intent(in) :: fy
523 real(kind=
rp),
dimension(n),
intent(in) :: fz
524 type(
coef_t),
intent(inout) :: coef
528 type(
gs_t),
intent(inout) :: gs_h
530 integer,
optional,
intent(in) :: niter
531 integer :: iter, max_iter, ierr, i, p_cur, p_prev
532 real(kind=
rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
533 real(kind=
rp) :: pap, beta
542 if (
present(niter))
then
547 norm_fac = 1.0_rp / sqrt(coef%volume)
549 associate(w1 => this%w1, w2 => this%w2, w3 => this%w3, r1 => this%r1, &
550 r2 => this%r2, r3 => this%r3, p1 => this%p1, p2 => this%p2, &
551 p3 => this%p3, z1 => this%z1, z2 => this%z2, z3 => this%z3, &
552 tmp_d => this%tmp_d, alpha => this%alpha, alpha_d => this%alpha_d, &
553 w1_d => this%w1_d, w2_d => this%w2_d, w3_d => this%w3_d, &
554 r1_d => this%r1_d, r2_d => this%r2_d, r3_d => this%r3_d, &
555 z1_d => this%z1_d, z2_d => this%z2_d, z3_d => this%z3_d, &
556 p1_d => this%p1_d, p2_d => this%p2_d, p3_d => this%p3_d, &
557 p1_d_d => this%p1_d_d, p2_d_d => this%p2_d_d, p3_d_d => this%p3_d_d)
575 r2_d, r3_d, tmp_d, n)
579 rnorm = sqrt(rtr)*norm_fac
580 ksp_results%res_start = rnorm
581 ksp_results%res_final = rnorm
582 ksp_results(1)%iter = 0
583 ksp_results(2:3)%iter = -1
584 if(
abscmp(rnorm, 0.0_rp))
return
585 call this%monitor_start(
'fcpldCG')
586 do iter = 1, max_iter
587 call this%M%solve(z1, r1, n)
588 call this%M%solve(z2, r2, n)
589 call this%M%solve(z3, r3, n)
592 r1_d, r2_d, r3_d, tmp_d, n)
596 if (iter .eq. 1) beta = 0.0_rp
599 z1_d, z2_d, z3_d, p1_d(p_prev), p2_d(p_prev), p3_d(p_prev), beta, n)
601 call ax%compute_vector(w1, w2, w3, &
602 p1(1, p_cur), p2(1, p_cur), p3(1, p_cur), coef, x%msh, x%Xh)
603 call gs_h%op(w1, n, gs_op_add, this%gs_event1)
604 call gs_h%op(w2, n, gs_op_add, this%gs_event2)
605 call gs_h%op(w3, n, gs_op_add, this%gs_event3)
609 call blstx%apply(w1, n)
610 call blsty%apply(w2, n)
611 call blstz%apply(w3, n)
614 p2_d(p_cur), p3_d(p_cur), tmp_d, n)
618 alpha(p_cur) = rtz1 / pap
620 w1_d, w2_d, w3_d, alpha_d, alpha(p_cur), p_cur, n)
621 rnorm = sqrt(rtr)*norm_fac
622 call this%monitor_iter(iter, rnorm)
624 (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter)
then
626 p1_d_d, p2_d_d, p3_d_d, alpha_d, p_cur, n)
629 if (rnorm .lt. this%abs_tol)
exit
635 call this%monitor_stop()
636 ksp_results%res_final = rnorm
637 ksp_results%iter = iter
638 ksp_results%converged = this%is_converged(iter, rnorm)
646 gs_h, niter)
result(ksp_results)
648 class(
ax_t),
intent(in) :: ax
649 type(
field_t),
intent(inout) :: x
650 integer,
intent(in) :: n
651 real(kind=
rp),
dimension(n),
intent(in) :: f
652 type(
coef_t),
intent(inout) :: coef
654 type(
gs_t),
intent(inout) :: gs_h
656 integer,
optional,
intent(in) :: niter
659 call neko_error(
'The cpldcg solver is only defined for coupled solves')
661 ksp_results%res_final = 0.0
663 ksp_results%converged = .false.
void hip_fusedcg_cpld_update_x(void *x1, void *x2, void *x3, void *p1, void *p2, void *p3, void *alpha, int *p_cur, int *n)
void hip_fusedcg_cpld_update_p(void *p1, void *p2, void *p3, void *z1, void *z2, void *z3, void *po1, void *po2, void *po3, real *beta, int *n)
real hip_fusedcg_cpld_part2(void *a1, void *a2, void *a3, void *b, void *c1, void *c2, void *c3, void *alpha_d, real *alpha, int *p_cur, int *n)
void hip_fusedcg_cpld_part1(void *a1, void *a2, void *a3, void *b1, void *b2, void *b3, void *tmp, int *n)
Return the device pointer for an associated Fortran array.
Map a Fortran array to a device (allocate and associate)
Copy data between host and device (or device and device)
Defines a Matrix-vector product.
type(mpi_comm) neko_comm
MPI communicator.
type(mpi_datatype) mpi_real_precision
MPI type for working precision of REAL types.
integer pe_size
MPI size of communicator.
subroutine, public device_rzero(a_d, n)
Zero a real vector.
real(kind=rp) function, public device_glsc2(a_d, b_d, n)
Weighted inner product .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n)
Weighted inner product .
subroutine, public device_copy(a_d, b_d, n)
Copy a vector .
Device abstraction, common interface for various accelerators.
subroutine, public device_event_sync(event)
Synchronize an event.
integer, parameter, public host_to_device
subroutine, public device_free(x_d)
Deallocate memory on the device.
subroutine, public device_event_destroy(event)
Destroy a device event.
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
subroutine, public device_event_create(event, flags)
Create a device event queue.
Defines a fused Conjugate Gradient method for accelerators.
subroutine device_fusedcg_cpld_update_x(x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha, p_cur, n)
type(ksp_monitor_t) function fusedcg_cpld_device_solve(this, ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
subroutine fusedcg_cpld_device_free(this)
Deallocate a pipelined PCG solver.
subroutine device_fusedcg_cpld_update_p(p1_d, p2_d, p3_d, z1_d, z2_d, z3_d, po1_d, po2_d, po3_d, beta, n)
subroutine fusedcg_cpld_device_init(this, n, max_iter, m, rel_tol, abs_tol, monitor)
Initialise a fused PCG solver.
real(kind=rp) function device_fusedcg_cpld_part2(a1_d, a2_d, a3_d, b_d, c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
type(ksp_monitor_t) function, dimension(3) fusedcg_cpld_device_solve_coupled(this, ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Pipelined PCG solve coupled solve.
integer, parameter device_fusedcg_cpld_p_space
subroutine device_fusedcg_cpld_part1(a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d, n)
Implements the base abstract type for Krylov solvers plus helper types.
integer, parameter, public ksp_max_iter
Maximum number of iters.
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
subroutine, public copy(a, b, n)
Copy a vector .
subroutine, public rzero(a, n)
Zero a real vector.
integer, parameter, public c_rp
integer, parameter, public rp
Global precision used in computations.
Base type for a matrix-vector product providing .
A list of allocatable `bc_t`. Follows the standard interface of lists.
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Base abstract type for a canonical Krylov method, solving .
Defines a canonical Krylov preconditioner.