48 use mpi_f08,
only : mpi_in_place, mpi_allreduce, &
51 use,
intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr, &
52 c_associated, c_size_t, c_sizeof, c_int, c_loc
60 real(kind=
rp),
allocatable :: w1(:)
61 real(kind=
rp),
allocatable :: w2(:)
62 real(kind=
rp),
allocatable :: w3(:)
63 real(kind=
rp),
allocatable :: r1(:)
64 real(kind=
rp),
allocatable :: r2(:)
65 real(kind=
rp),
allocatable :: r3(:)
66 real(kind=
rp),
allocatable :: z1(:)
67 real(kind=
rp),
allocatable :: z2(:)
68 real(kind=
rp),
allocatable :: z3(:)
69 real(kind=
rp),
allocatable :: tmp(:)
70 real(kind=
rp),
allocatable :: p1(:,:)
71 real(kind=
rp),
allocatable :: p2(:,:)
72 real(kind=
rp),
allocatable :: p3(:,:)
73 real(kind=
rp),
allocatable :: alpha(:)
74 type(c_ptr) :: w1_d = c_null_ptr
75 type(c_ptr) :: w2_d = c_null_ptr
76 type(c_ptr) :: w3_d = c_null_ptr
77 type(c_ptr) :: r1_d = c_null_ptr
78 type(c_ptr) :: r2_d = c_null_ptr
79 type(c_ptr) :: r3_d = c_null_ptr
80 type(c_ptr) :: z1_d = c_null_ptr
81 type(c_ptr) :: z2_d = c_null_ptr
82 type(c_ptr) :: z3_d = c_null_ptr
83 type(c_ptr) :: alpha_d = c_null_ptr
84 type(c_ptr) :: p1_d_d = c_null_ptr
85 type(c_ptr) :: p2_d_d = c_null_ptr
86 type(c_ptr) :: p3_d_d = c_null_ptr
87 type(c_ptr) :: tmp_d = c_null_ptr
88 type(c_ptr),
allocatable :: p1_d(:)
89 type(c_ptr),
allocatable :: p2_d(:)
90 type(c_ptr),
allocatable :: p3_d(:)
91 type(c_ptr) :: gs_event1 = c_null_ptr
92 type(c_ptr) :: gs_event2 = c_null_ptr
93 type(c_ptr) :: gs_event3 = c_null_ptr
104 b1_d, b2_d, b3_d, tmp_d, n) bind(c, name='cuda_fusedcg_cpld_part1')
105 use,
intrinsic :: iso_c_binding
108 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d
115 po1_d, po2_d, po3_d, beta, n) bind(c, name='cuda_fusedcg_cpld_update_p')
116 use,
intrinsic :: iso_c_binding
119 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
120 type(c_ptr),
value :: po1_d, po2_d, po3_d
128 alpha, p_cur, n) bind(c, name='cuda_fusedcg_cpld_update_x')
129 use,
intrinsic :: iso_c_binding
131 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
132 integer(c_int) :: p_cur, n
138 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n) &
139 bind(c, name=
'cuda_fusedcg_cpld_part2')
140 use,
intrinsic :: iso_c_binding
143 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
144 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
146 integer(c_int) :: n, p_cur
152 b1_d, b2_d, b3_d, tmp_d, n) bind(c, name='hip_fusedcg_cpld_part1')
153 use,
intrinsic :: iso_c_binding
156 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d
163 po1_d, po2_d, po3_d, beta, n) bind(c, name='hip_fusedcg_cpld_update_p')
164 use,
intrinsic :: iso_c_binding
167 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
168 type(c_ptr),
value :: po1_d, po2_d, po3_d
176 alpha, p_cur, n) bind(c, name='hip_fusedcg_cpld_update_x')
177 use,
intrinsic :: iso_c_binding
179 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
180 integer(c_int) :: p_cur, n
186 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n) &
187 bind(c, name=
'hip_fusedcg_cpld_part2')
188 use,
intrinsic :: iso_c_binding
191 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
192 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
194 integer(c_int) :: n, p_cur
202 b1_d, b2_d, b3_d, tmp_d, n)
203 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d
204 type(c_ptr),
value :: tmp_d
211 call neko_error(
'No device backend configured')
216 po1_d, po2_d, po3_d, beta, n)
217 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
218 type(c_ptr),
value :: po1_d, po2_d, po3_d
223 po1_d, po2_d, po3_d, beta, n)
226 po1_d, po2_d, po3_d, beta, n)
228 call neko_error(
'No device backend configured')
233 p1_d, p2_d, p3_d, alpha, p_cur, n)
234 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
235 integer(c_int) :: p_cur, n
238 p1_d, p2_d, p3_d, alpha, p_cur, n)
241 p1_d, p2_d, p3_d, alpha, p_cur, n)
243 call neko_error(
'No device backend configured')
248 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
result(res)
249 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
250 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
257 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
260 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
262 call neko_error(
'No device backend configured')
265#ifndef HAVE_DEVICE_MPI
267 call mpi_allreduce(mpi_in_place, res, 1, &
276 rel_tol, abs_tol, monitor)
278 class(
pc_t),
optional,
intent(in),
target :: M
279 integer,
intent(in) :: n
280 integer,
intent(in) :: max_iter
281 real(kind=
rp),
optional,
intent(in) :: rel_tol
282 real(kind=
rp),
optional,
intent(in) :: abs_tol
283 logical,
optional,
intent(in) :: monitor
285 integer(c_size_t) :: p_size
299 allocate(this%tmp(n))
324 this%p1_d(i) = c_null_ptr
325 call device_map(this%p1(:,i), this%p1_d(i), n)
327 this%p2_d(i) = c_null_ptr
328 call device_map(this%p2(:,i), this%p2_d(i), n)
330 this%p3_d(i) = c_null_ptr
331 call device_map(this%p3(:,i), this%p3_d(i), n)
338 ptr = c_loc(this%p1_d)
341 ptr = c_loc(this%p2_d)
344 ptr = c_loc(this%p3_d)
347 if (
present(rel_tol) .and.
present(abs_tol) .and.
present(monitor))
then
348 call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
349 else if (
present(rel_tol) .and.
present(abs_tol))
then
350 call this%ksp_init(max_iter, rel_tol, abs_tol)
351 else if (
present(monitor) .and.
present(abs_tol))
then
352 call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
353 else if (
present(rel_tol) .and.
present(monitor))
then
354 call this%ksp_init(max_iter, rel_tol, monitor = monitor)
355 else if (
present(rel_tol))
then
356 call this%ksp_init(max_iter, rel_tol = rel_tol)
357 else if (
present(abs_tol))
then
358 call this%ksp_init(max_iter, abs_tol = abs_tol)
359 else if (
present(monitor))
then
360 call this%ksp_init(max_iter, monitor = monitor)
362 call this%ksp_init(max_iter)
378 if (
allocated(this%w1))
then
382 if (
allocated(this%w2))
then
386 if (
allocated(this%w3))
then
390 if (
allocated(this%r1))
then
394 if (
allocated(this%r2))
then
398 if (
allocated(this%r3))
then
402 if (
allocated(this%z1))
then
406 if (
allocated(this%z2))
then
410 if (
allocated(this%z3))
then
414 if (
allocated(this%tmp))
then
418 if (
allocated(this%alpha))
then
419 deallocate(this%alpha)
422 if (
allocated(this%p1))
then
426 if (
allocated(this%p2))
then
430 if (
allocated(this%p3))
then
434 if (c_associated(this%w1_d))
then
438 if (c_associated(this%w2_d))
then
442 if (c_associated(this%w3_d))
then
446 if (c_associated(this%r1_d))
then
450 if (c_associated(this%r2_d))
then
454 if (c_associated(this%r3_d))
then
458 if (c_associated(this%z1_d))
then
462 if (c_associated(this%z2_d))
then
466 if (c_associated(this%z3_d))
then
470 if (c_associated(this%alpha_d))
then
474 if (c_associated(this%tmp_d))
then
478 if (
allocated(this%p1_d))
then
480 if (c_associated(this%p1_d(i)))
then
486 if (
allocated(this%p2_d))
then
488 if (c_associated(this%p2_d(i)))
then
494 if (
allocated(this%p3_d))
then
496 if (c_associated(this%p3_d(i)))
then
502 if (c_associated(this%p1_d_d))
then
506 if (c_associated(this%p2_d_d))
then
510 if (c_associated(this%p3_d_d))
then
516 if (c_associated(this%gs_event1))
then
520 if (c_associated(this%gs_event2))
then
524 if (c_associated(this%gs_event3))
then
532 n, coef, blstx, blsty, blstz, gs_h, niter)
result(ksp_results)
534 class(
ax_t),
intent(in) :: ax
535 type(
field_t),
intent(inout) :: x
536 type(
field_t),
intent(inout) :: y
537 type(
field_t),
intent(inout) :: z
538 integer,
intent(in) :: n
539 real(kind=
rp),
dimension(n),
intent(in) :: fx
540 real(kind=
rp),
dimension(n),
intent(in) :: fy
541 real(kind=
rp),
dimension(n),
intent(in) :: fz
542 type(
coef_t),
intent(inout) :: coef
546 type(
gs_t),
intent(inout) :: gs_h
548 integer,
optional,
intent(in) :: niter
549 integer :: iter, max_iter, ierr, i, p_cur, p_prev
550 real(kind=
rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
551 real(kind=
rp) :: pap, beta
560 if (
present(niter))
then
565 norm_fac = 1.0_rp / sqrt(coef%volume)
567 associate(w1 => this%w1, w2 => this%w2, w3 => this%w3, r1 => this%r1, &
568 r2 => this%r2, r3 => this%r3, p1 => this%p1, p2 => this%p2, &
569 p3 => this%p3, z1 => this%z1, z2 => this%z2, z3 => this%z3, &
570 tmp_d => this%tmp_d, alpha => this%alpha, alpha_d => this%alpha_d, &
571 w1_d => this%w1_d, w2_d => this%w2_d, w3_d => this%w3_d, &
572 r1_d => this%r1_d, r2_d => this%r2_d, r3_d => this%r3_d, &
573 z1_d => this%z1_d, z2_d => this%z2_d, z3_d => this%z3_d, &
574 p1_d => this%p1_d, p2_d => this%p2_d, p3_d => this%p3_d, &
575 p1_d_d => this%p1_d_d, p2_d_d => this%p2_d_d, p3_d_d => this%p3_d_d)
593 r2_d, r3_d, tmp_d, n)
597 rnorm = sqrt(rtr)*norm_fac
598 ksp_results%res_start = rnorm
599 ksp_results%res_final = rnorm
600 ksp_results(1)%iter = 0
601 ksp_results(2:3)%iter = -1
602 if(
abscmp(rnorm, 0.0_rp))
then
603 ksp_results%converged = .true.
606 call this%monitor_start(
'fcpldCG')
607 do iter = 1, max_iter
608 call this%M%solve(z1, r1, n)
609 call this%M%solve(z2, r2, n)
610 call this%M%solve(z3, r3, n)
613 r1_d, r2_d, r3_d, tmp_d, n)
617 if (iter .eq. 1) beta = 0.0_rp
620 z1_d, z2_d, z3_d, p1_d(p_prev), p2_d(p_prev), p3_d(p_prev), beta, n)
622 call ax%compute_vector(w1, w2, w3, &
623 p1(1, p_cur), p2(1, p_cur), p3(1, p_cur), coef, x%msh, x%Xh)
626 call gs_h%op(w1, n, gs_op_add, this%gs_event1)
628 call blstx%apply(w1, n)
629 call gs_h%op(w2, n, gs_op_add, this%gs_event2)
631 call blsty%apply(w2, n)
632 call gs_h%op(w3, n, gs_op_add, this%gs_event3)
634 call blstz%apply(w3, n)
638 p2_d(p_cur), p3_d(p_cur), tmp_d, n)
642 alpha(p_cur) = rtz1 / pap
644 w1_d, w2_d, w3_d, alpha_d, alpha(p_cur), p_cur, n)
645 rnorm = sqrt(rtr)*norm_fac
646 call this%monitor_iter(iter, rnorm)
648 (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter)
then
650 p1_d_d, p2_d_d, p3_d_d, alpha_d, p_cur, n)
653 if (rnorm .lt. this%abs_tol)
exit
659 call this%monitor_stop()
660 ksp_results%res_final = rnorm
661 ksp_results%iter = iter
662 ksp_results%converged = this%is_converged(iter, rnorm)
670 gs_h, niter)
result(ksp_results)
672 class(
ax_t),
intent(in) :: ax
673 type(
field_t),
intent(inout) :: x
674 integer,
intent(in) :: n
675 real(kind=
rp),
dimension(n),
intent(in) :: f
676 type(
coef_t),
intent(inout) :: coef
678 type(
gs_t),
intent(inout) :: gs_h
680 integer,
optional,
intent(in) :: niter
683 call neko_error(
'The cpldcg solver is only defined for coupled solves')
685 ksp_results%res_final = 0.0
687 ksp_results%converged = .false.
__device__ T solve(const T u, const T y, const T guess, const T nu, const T kappa, const T B)
void hip_fusedcg_cpld_update_x(void *x1, void *x2, void *x3, void *p1, void *p2, void *p3, void *alpha, int *p_cur, int *n)
void hip_fusedcg_cpld_update_p(void *p1, void *p2, void *p3, void *z1, void *z2, void *z3, void *po1, void *po2, void *po3, real *beta, int *n)
real hip_fusedcg_cpld_part2(void *a1, void *a2, void *a3, void *b, void *c1, void *c2, void *c3, void *alpha_d, real *alpha, int *p_cur, int *n)
void hip_fusedcg_cpld_part1(void *a1, void *a2, void *a3, void *b1, void *b2, void *b3, void *tmp, int *n)
Return the device pointer for an associated Fortran array.
Map a Fortran array to a device (allocate and associate)
Copy data between host and device (or device and device)
Defines a Matrix-vector product.
type(mpi_datatype), public mpi_real_precision
MPI type for working precision of REAL types.
integer, public pe_size
MPI size of communicator.
type(mpi_comm), public neko_comm
MPI communicator.
subroutine, public device_rzero(a_d, n, strm)
Zero a real vector.
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
real(kind=rp) function, public device_glsc2(a_d, b_d, n, strm)
Weighted inner product .
Device abstraction, common interface for various accelerators.
subroutine, public device_event_sync(event)
Synchronize an event.
integer, parameter, public host_to_device
subroutine, public device_free(x_d)
Deallocate memory on the device.
subroutine, public device_event_destroy(event)
Destroy a device event.
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
subroutine, public device_event_create(event, flags)
Create a device event queue.
Defines a fused Conjugate Gradient method for accelerators.
subroutine device_fusedcg_cpld_update_x(x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha, p_cur, n)
type(ksp_monitor_t) function fusedcg_cpld_device_solve(this, ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
subroutine fusedcg_cpld_device_free(this)
Deallocate a pipelined PCG solver.
subroutine device_fusedcg_cpld_update_p(p1_d, p2_d, p3_d, z1_d, z2_d, z3_d, po1_d, po2_d, po3_d, beta, n)
subroutine fusedcg_cpld_device_init(this, n, max_iter, m, rel_tol, abs_tol, monitor)
Initialise a fused PCG solver.
real(kind=rp) function device_fusedcg_cpld_part2(a1_d, a2_d, a3_d, b_d, c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
type(ksp_monitor_t) function, dimension(3) fusedcg_cpld_device_solve_coupled(this, ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Pipelined PCG solve coupled solve.
integer, parameter device_fusedcg_cpld_p_space
subroutine device_fusedcg_cpld_part1(a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d, n)
Implements the base abstract type for Krylov solvers plus helper types.
integer, parameter, public ksp_max_iter
Maximum number of iters.
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
subroutine, public copy(a, b, n)
Copy a vector .
subroutine, public rzero(a, n)
Zero a real vector.
integer, parameter, public c_rp
integer, parameter, public rp
Global precision used in computations.
Base type for a matrix-vector product providing .
A list of allocatable `bc_t`. Follows the standard interface of lists.
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Base abstract type for a canonical Krylov method, solving .
Defines a canonical Krylov preconditioner.