49 use,
intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr, &
50 c_associated, c_size_t, c_sizeof, c_int, c_loc
58 real(kind=
rp),
allocatable :: w1(:)
59 real(kind=
rp),
allocatable :: w2(:)
60 real(kind=
rp),
allocatable :: w3(:)
61 real(kind=
rp),
allocatable :: r1(:)
62 real(kind=
rp),
allocatable :: r2(:)
63 real(kind=
rp),
allocatable :: r3(:)
64 real(kind=
rp),
allocatable :: z1(:)
65 real(kind=
rp),
allocatable :: z2(:)
66 real(kind=
rp),
allocatable :: z3(:)
67 real(kind=
rp),
allocatable :: tmp(:)
68 real(kind=
rp),
allocatable :: p1(:,:)
69 real(kind=
rp),
allocatable :: p2(:,:)
70 real(kind=
rp),
allocatable :: p3(:,:)
71 real(kind=
rp),
allocatable :: alpha(:)
72 type(c_ptr) :: w1_d = c_null_ptr
73 type(c_ptr) :: w2_d = c_null_ptr
74 type(c_ptr) :: w3_d = c_null_ptr
75 type(c_ptr) :: r1_d = c_null_ptr
76 type(c_ptr) :: r2_d = c_null_ptr
77 type(c_ptr) :: r3_d = c_null_ptr
78 type(c_ptr) :: z1_d = c_null_ptr
79 type(c_ptr) :: z2_d = c_null_ptr
80 type(c_ptr) :: z3_d = c_null_ptr
81 type(c_ptr) :: alpha_d = c_null_ptr
82 type(c_ptr) :: p1_d_d = c_null_ptr
83 type(c_ptr) :: p2_d_d = c_null_ptr
84 type(c_ptr) :: p3_d_d = c_null_ptr
85 type(c_ptr) :: tmp_d = c_null_ptr
86 type(c_ptr),
allocatable :: p1_d(:)
87 type(c_ptr),
allocatable :: p2_d(:)
88 type(c_ptr),
allocatable :: p3_d(:)
89 type(c_ptr) :: gs_event1 = c_null_ptr
90 type(c_ptr) :: gs_event2 = c_null_ptr
91 type(c_ptr) :: gs_event3 = c_null_ptr
102 b1_d, b2_d, b3_d, tmp_d, n) bind(c, name='cuda_fusedcg_cpld_part1')
103 use,
intrinsic :: iso_c_binding
106 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d
113 po1_d, po2_d, po3_d, beta, n) bind(c, name='cuda_fusedcg_cpld_update_p')
114 use,
intrinsic :: iso_c_binding
117 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
118 type(c_ptr),
value :: po1_d, po2_d, po3_d
126 alpha, p_cur, n) bind(c, name='cuda_fusedcg_cpld_update_x')
127 use,
intrinsic :: iso_c_binding
129 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
130 integer(c_int) :: p_cur, n
136 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n) &
137 bind(c, name=
'cuda_fusedcg_cpld_part2')
138 use,
intrinsic :: iso_c_binding
141 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
142 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
144 integer(c_int) :: n, p_cur
150 b1_d, b2_d, b3_d, tmp_d, n) bind(c, name='hip_fusedcg_cpld_part1')
151 use,
intrinsic :: iso_c_binding
154 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d
161 po1_d, po2_d, po3_d, beta, n) bind(c, name='hip_fusedcg_cpld_update_p')
162 use,
intrinsic :: iso_c_binding
165 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
166 type(c_ptr),
value :: po1_d, po2_d, po3_d
174 alpha, p_cur, n) bind(c, name='hip_fusedcg_cpld_update_x')
175 use,
intrinsic :: iso_c_binding
177 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
178 integer(c_int) :: p_cur, n
184 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n) &
185 bind(c, name=
'hip_fusedcg_cpld_part2')
186 use,
intrinsic :: iso_c_binding
189 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
190 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
192 integer(c_int) :: n, p_cur
200 b1_d, b2_d, b3_d, tmp_d, n)
201 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d
202 type(c_ptr),
value :: tmp_d
209 call neko_error(
'No device backend configured')
214 po1_d, po2_d, po3_d, beta, n)
215 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
216 type(c_ptr),
value :: po1_d, po2_d, po3_d
221 po1_d, po2_d, po3_d, beta, n)
224 po1_d, po2_d, po3_d, beta, n)
226 call neko_error(
'No device backend configured')
231 p1_d, p2_d, p3_d, alpha, p_cur, n)
232 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
233 integer(c_int) :: p_cur, n
236 p1_d, p2_d, p3_d, alpha, p_cur, n)
239 p1_d, p2_d, p3_d, alpha, p_cur, n)
241 call neko_error(
'No device backend configured')
246 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
result(res)
247 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
248 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
255 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
258 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
260 call neko_error(
'No device backend configured')
263#ifndef HAVE_DEVICE_MPI
265 call mpi_allreduce(mpi_in_place, res, 1, &
274 rel_tol, abs_tol, monitor)
276 class(
pc_t),
optional,
intent(in),
target :: M
277 integer,
intent(in) :: n
278 integer,
intent(in) :: max_iter
279 real(kind=
rp),
optional,
intent(in) :: rel_tol
280 real(kind=
rp),
optional,
intent(in) :: abs_tol
281 logical,
optional,
intent(in) :: monitor
283 integer(c_size_t) :: p_size
297 allocate(this%tmp(n))
322 this%p1_d(i) = c_null_ptr
323 call device_map(this%p1(:,i), this%p1_d(i), n)
325 this%p2_d(i) = c_null_ptr
326 call device_map(this%p2(:,i), this%p2_d(i), n)
328 this%p3_d(i) = c_null_ptr
329 call device_map(this%p3(:,i), this%p3_d(i), n)
336 ptr = c_loc(this%p1_d)
339 ptr = c_loc(this%p2_d)
342 ptr = c_loc(this%p3_d)
345 if (
present(rel_tol) .and.
present(abs_tol) .and.
present(monitor))
then
346 call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
347 else if (
present(rel_tol) .and.
present(abs_tol))
then
348 call this%ksp_init(max_iter, rel_tol, abs_tol)
349 else if (
present(monitor) .and.
present(abs_tol))
then
350 call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
351 else if (
present(rel_tol) .and.
present(monitor))
then
352 call this%ksp_init(max_iter, rel_tol, monitor = monitor)
353 else if (
present(rel_tol))
then
354 call this%ksp_init(max_iter, rel_tol = rel_tol)
355 else if (
present(abs_tol))
then
356 call this%ksp_init(max_iter, abs_tol = abs_tol)
357 else if (
present(monitor))
then
358 call this%ksp_init(max_iter, monitor = monitor)
360 call this%ksp_init(max_iter)
376 if (
allocated(this%w1))
then
380 if (
allocated(this%w2))
then
384 if (
allocated(this%w3))
then
388 if (
allocated(this%r1))
then
392 if (
allocated(this%r2))
then
396 if (
allocated(this%r3))
then
400 if (
allocated(this%z1))
then
404 if (
allocated(this%z2))
then
408 if (
allocated(this%z3))
then
412 if (
allocated(this%tmp))
then
416 if (
allocated(this%alpha))
then
417 deallocate(this%alpha)
420 if (
allocated(this%p1))
then
424 if (
allocated(this%p2))
then
428 if (
allocated(this%p3))
then
432 if (c_associated(this%w1_d))
then
436 if (c_associated(this%w2_d))
then
440 if (c_associated(this%w3_d))
then
444 if (c_associated(this%r1_d))
then
448 if (c_associated(this%r2_d))
then
452 if (c_associated(this%r3_d))
then
456 if (c_associated(this%z1_d))
then
460 if (c_associated(this%z2_d))
then
464 if (c_associated(this%z3_d))
then
468 if (c_associated(this%alpha_d))
then
472 if (c_associated(this%tmp_d))
then
476 if (
allocated(this%p1_d))
then
478 if (c_associated(this%p1_d(i)))
then
484 if (
allocated(this%p2_d))
then
486 if (c_associated(this%p2_d(i)))
then
492 if (
allocated(this%p3_d))
then
494 if (c_associated(this%p3_d(i)))
then
502 if (c_associated(this%gs_event1))
then
506 if (c_associated(this%gs_event2))
then
510 if (c_associated(this%gs_event3))
then
518 n, coef, blstx, blsty, blstz, gs_h, niter)
result(ksp_results)
520 class(
ax_t),
intent(in) :: ax
521 type(
field_t),
intent(inout) :: x
522 type(
field_t),
intent(inout) :: y
523 type(
field_t),
intent(inout) :: z
524 integer,
intent(in) :: n
525 real(kind=
rp),
dimension(n),
intent(in) :: fx
526 real(kind=
rp),
dimension(n),
intent(in) :: fy
527 real(kind=
rp),
dimension(n),
intent(in) :: fz
528 type(
coef_t),
intent(inout) :: coef
532 type(
gs_t),
intent(inout) :: gs_h
534 integer,
optional,
intent(in) :: niter
535 integer :: iter, max_iter, ierr, i, p_cur, p_prev
536 real(kind=
rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
537 real(kind=
rp) :: pap, beta
546 if (
present(niter))
then
551 norm_fac = 1.0_rp / sqrt(coef%volume)
553 associate(w1 => this%w1, w2 => this%w2, w3 => this%w3, r1 => this%r1, &
554 r2 => this%r2, r3 => this%r3, p1 => this%p1, p2 => this%p2, &
555 p3 => this%p3, z1 => this%z1, z2 => this%z2, z3 => this%z3, &
556 tmp_d => this%tmp_d, alpha => this%alpha, alpha_d => this%alpha_d, &
557 w1_d => this%w1_d, w2_d => this%w2_d, w3_d => this%w3_d, &
558 r1_d => this%r1_d, r2_d => this%r2_d, r3_d => this%r3_d, &
559 z1_d => this%z1_d, z2_d => this%z2_d, z3_d => this%z3_d, &
560 p1_d => this%p1_d, p2_d => this%p2_d, p3_d => this%p3_d, &
561 p1_d_d => this%p1_d_d, p2_d_d => this%p2_d_d, p3_d_d => this%p3_d_d)
579 r2_d, r3_d, tmp_d, n)
583 rnorm = sqrt(rtr)*norm_fac
584 ksp_results%res_start = rnorm
585 ksp_results%res_final = rnorm
586 ksp_results(1)%iter = 0
587 ksp_results(2:3)%iter = -1
588 if(
abscmp(rnorm, 0.0_rp))
return
589 call this%monitor_start(
'fcpldCG')
590 do iter = 1, max_iter
591 call this%M%solve(z1, r1, n)
592 call this%M%solve(z2, r2, n)
593 call this%M%solve(z3, r3, n)
596 r1_d, r2_d, r3_d, tmp_d, n)
600 if (iter .eq. 1) beta = 0.0_rp
603 z1_d, z2_d, z3_d, p1_d(p_prev), p2_d(p_prev), p3_d(p_prev), beta, n)
605 call ax%compute_vector(w1, w2, w3, &
606 p1(1, p_cur), p2(1, p_cur), p3(1, p_cur), coef, x%msh, x%Xh)
607 call gs_h%op(w1, n, gs_op_add, this%gs_event1)
608 call gs_h%op(w2, n, gs_op_add, this%gs_event2)
609 call gs_h%op(w3, n, gs_op_add, this%gs_event3)
613 call blstx%apply(w1, n)
614 call blsty%apply(w2, n)
615 call blstz%apply(w3, n)
618 p2_d(p_cur), p3_d(p_cur), tmp_d, n)
622 alpha(p_cur) = rtz1 / pap
624 w1_d, w2_d, w3_d, alpha_d, alpha(p_cur), p_cur, n)
625 rnorm = sqrt(rtr)*norm_fac
626 call this%monitor_iter(iter, rnorm)
628 (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter)
then
630 p1_d_d, p2_d_d, p3_d_d, alpha_d, p_cur, n)
633 if (rnorm .lt. this%abs_tol)
exit
639 call this%monitor_stop()
640 ksp_results%res_final = rnorm
641 ksp_results%iter = iter
642 ksp_results%converged = this%is_converged(iter, rnorm)
650 gs_h, niter)
result(ksp_results)
652 class(
ax_t),
intent(in) :: ax
653 type(
field_t),
intent(inout) :: x
654 integer,
intent(in) :: n
655 real(kind=
rp),
dimension(n),
intent(in) :: f
656 type(
coef_t),
intent(inout) :: coef
658 type(
gs_t),
intent(inout) :: gs_h
660 integer,
optional,
intent(in) :: niter
663 call neko_error(
'The cpldcg solver is only defined for coupled solves')
665 ksp_results%res_final = 0.0
667 ksp_results%converged = .false.
void hip_fusedcg_cpld_update_x(void *x1, void *x2, void *x3, void *p1, void *p2, void *p3, void *alpha, int *p_cur, int *n)
void hip_fusedcg_cpld_update_p(void *p1, void *p2, void *p3, void *z1, void *z2, void *z3, void *po1, void *po2, void *po3, real *beta, int *n)
real hip_fusedcg_cpld_part2(void *a1, void *a2, void *a3, void *b, void *c1, void *c2, void *c3, void *alpha_d, real *alpha, int *p_cur, int *n)
void hip_fusedcg_cpld_part1(void *a1, void *a2, void *a3, void *b1, void *b2, void *b3, void *tmp, int *n)
Return the device pointer for an associated Fortran array.
Map a Fortran array to a device (allocate and associate)
Copy data between host and device (or device and device)
Defines a Matrix-vector product.
type(mpi_comm) neko_comm
MPI communicator.
type(mpi_datatype) mpi_real_precision
MPI type for working precision of REAL types.
integer pe_size
MPI size of communicator.
subroutine, public device_rzero(a_d, n)
Zero a real vector.
real(kind=rp) function, public device_glsc2(a_d, b_d, n)
Weighted inner product .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n)
Weighted inner product .
subroutine, public device_copy(a_d, b_d, n)
Copy a vector .
Device abstraction, common interface for various accelerators.
subroutine, public device_event_sync(event)
Synchronize an event.
integer, parameter, public host_to_device
subroutine, public device_free(x_d)
Deallocate memory on the device.
subroutine, public device_event_destroy(event)
Destroy a device event.
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
subroutine, public device_event_create(event, flags)
Create a device event queue.
Defines a fused Conjugate Gradient method for accelerators.
subroutine device_fusedcg_cpld_update_x(x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha, p_cur, n)
type(ksp_monitor_t) function fusedcg_cpld_device_solve(this, ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
subroutine fusedcg_cpld_device_free(this)
Deallocate a pipelined PCG solver.
subroutine device_fusedcg_cpld_update_p(p1_d, p2_d, p3_d, z1_d, z2_d, z3_d, po1_d, po2_d, po3_d, beta, n)
subroutine fusedcg_cpld_device_init(this, n, max_iter, m, rel_tol, abs_tol, monitor)
Initialise a fused PCG solver.
real(kind=rp) function device_fusedcg_cpld_part2(a1_d, a2_d, a3_d, b_d, c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
type(ksp_monitor_t) function, dimension(3) fusedcg_cpld_device_solve_coupled(this, ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Pipelined PCG solve coupled solve.
integer, parameter device_fusedcg_cpld_p_space
subroutine device_fusedcg_cpld_part1(a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d, n)
Implements the base abstract type for Krylov solvers plus helper types.
integer, parameter, public ksp_max_iter
Maximum number of iters.
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
subroutine, public copy(a, b, n)
Copy a vector .
subroutine, public rzero(a, n)
Zero a real vector.
integer, parameter, public c_rp
integer, parameter, public rp
Global precision used in computations.
Base type for a matrix-vector product providing .
A list of allocatable `bc_t`. Follows the standard interface of lists.
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Base abstract type for a canonical Krylov method, solving .
Defines a canonical Krylov preconditioner.