54 real(kind=
rp),
allocatable :: w1(:)
55 real(kind=
rp),
allocatable :: w2(:)
56 real(kind=
rp),
allocatable :: w3(:)
57 real(kind=
rp),
allocatable :: r1(:)
58 real(kind=
rp),
allocatable :: r2(:)
59 real(kind=
rp),
allocatable :: r3(:)
60 real(kind=
rp),
allocatable :: z1(:)
61 real(kind=
rp),
allocatable :: z2(:)
62 real(kind=
rp),
allocatable :: z3(:)
63 real(kind=
rp),
allocatable :: tmp(:)
64 real(kind=
rp),
allocatable :: p1(:,:)
65 real(kind=
rp),
allocatable :: p2(:,:)
66 real(kind=
rp),
allocatable :: p3(:,:)
67 real(kind=
rp),
allocatable :: alpha(:)
68 type(c_ptr) :: w1_d = c_null_ptr
69 type(c_ptr) :: w2_d = c_null_ptr
70 type(c_ptr) :: w3_d = c_null_ptr
71 type(c_ptr) :: r1_d = c_null_ptr
72 type(c_ptr) :: r2_d = c_null_ptr
73 type(c_ptr) :: r3_d = c_null_ptr
74 type(c_ptr) :: z1_d = c_null_ptr
75 type(c_ptr) :: z2_d = c_null_ptr
76 type(c_ptr) :: z3_d = c_null_ptr
77 type(c_ptr) :: alpha_d = c_null_ptr
78 type(c_ptr) :: p1_d_d = c_null_ptr
79 type(c_ptr) :: p2_d_d = c_null_ptr
80 type(c_ptr) :: p3_d_d = c_null_ptr
81 type(c_ptr) :: tmp_d = c_null_ptr
82 type(c_ptr),
allocatable :: p1_d(:)
83 type(c_ptr),
allocatable :: p2_d(:)
84 type(c_ptr),
allocatable :: p3_d(:)
85 type(c_ptr) :: gs_event1 = c_null_ptr
86 type(c_ptr) :: gs_event2 = c_null_ptr
87 type(c_ptr) :: gs_event3 = c_null_ptr
98 b1_d, b2_d, b3_d, tmp_d, n) bind(c, name='cuda_fusedcg_cpld_part1')
99 use,
intrinsic :: iso_c_binding
102 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d
109 po1_d, po2_d, po3_d, beta, n) bind(c, name='cuda_fusedcg_cpld_update_p')
110 use,
intrinsic :: iso_c_binding
113 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
114 type(c_ptr),
value :: po1_d, po2_d, po3_d
122 alpha, p_cur, n) bind(c, name='cuda_fusedcg_cpld_update_x')
123 use,
intrinsic :: iso_c_binding
125 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
126 integer(c_int) :: p_cur, n
132 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n) &
133 bind(c, name=
'cuda_fusedcg_cpld_part2')
134 use,
intrinsic :: iso_c_binding
137 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
138 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
140 integer(c_int) :: n, p_cur
149 b1_d, b2_d, b3_d, tmp_d, n)
150 type(c_ptr),
value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d
151 type(c_ptr),
value :: tmp_d
157 call neko_error(
'No device backend configured')
162 po1_d, po2_d, po3_d, beta, n)
163 type(c_ptr),
value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
164 type(c_ptr),
value :: po1_d, po2_d, po3_d
170 po1_d, po2_d, po3_d, beta, n)
172 call neko_error(
'No device backend configured')
177 p1_d, p2_d, p3_d, alpha, p_cur, n)
178 type(c_ptr),
value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
179 integer(c_int) :: p_cur, n
183 p1_d, p2_d, p3_d, alpha, p_cur, n)
185 call neko_error(
'No device backend configured')
190 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
result(res)
191 type(c_ptr),
value :: a1_d, a2_d, a3_d, b_d
192 type(c_ptr),
value :: c1_d, c2_d, c3_d, alpha_d
200 c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
202 call neko_error(
'No device backend configured')
205 #ifndef HAVE_DEVICE_MPI
207 call mpi_allreduce(mpi_in_place, res, 1, &
216 rel_tol, abs_tol, monitor)
218 class(
pc_t),
optional,
intent(inout),
target :: M
219 integer,
intent(in) :: n
220 integer,
intent(in) :: max_iter
221 real(kind=
rp),
optional,
intent(inout) :: rel_tol
222 real(kind=
rp),
optional,
intent(inout) :: abs_tol
223 logical,
optional,
intent(in) :: monitor
225 integer(c_size_t) :: p_size
239 allocate(this%tmp(n))
264 this%p1_d(i) = c_null_ptr
265 call device_map(this%p1(:,i), this%p1_d(i), n)
267 this%p2_d(i) = c_null_ptr
268 call device_map(this%p2(:,i), this%p2_d(i), n)
270 this%p3_d(i) = c_null_ptr
271 call device_map(this%p3(:,i), this%p3_d(i), n)
278 ptr = c_loc(this%p1_d)
281 ptr = c_loc(this%p2_d)
284 ptr = c_loc(this%p3_d)
287 if (
present(rel_tol) .and.
present(abs_tol) .and.
present(monitor))
then
288 call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
289 else if (
present(rel_tol) .and.
present(abs_tol))
then
290 call this%ksp_init(max_iter, rel_tol, abs_tol)
291 else if (
present(monitor) .and.
present(abs_tol))
then
292 call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
293 else if (
present(rel_tol) .and.
present(monitor))
then
294 call this%ksp_init(max_iter, rel_tol, monitor = monitor)
295 else if (
present(rel_tol))
then
296 call this%ksp_init(max_iter, rel_tol = rel_tol)
297 else if (
present(abs_tol))
then
298 call this%ksp_init(max_iter, abs_tol = abs_tol)
299 else if (
present(monitor))
then
300 call this%ksp_init(max_iter, monitor = monitor)
302 call this%ksp_init(max_iter)
318 if (
allocated(this%w1))
then
322 if (
allocated(this%w2))
then
326 if (
allocated(this%w3))
then
330 if (
allocated(this%r1))
then
334 if (
allocated(this%r2))
then
338 if (
allocated(this%r3))
then
342 if (
allocated(this%z1))
then
346 if (
allocated(this%z2))
then
350 if (
allocated(this%z3))
then
354 if (
allocated(this%tmp))
then
358 if (
allocated(this%alpha))
then
359 deallocate(this%alpha)
362 if (
allocated(this%p1))
then
366 if (
allocated(this%p2))
then
370 if (
allocated(this%p3))
then
374 if (c_associated(this%w1_d))
then
378 if (c_associated(this%w2_d))
then
382 if (c_associated(this%w3_d))
then
386 if (c_associated(this%r1_d))
then
390 if (c_associated(this%r2_d))
then
394 if (c_associated(this%r3_d))
then
398 if (c_associated(this%z1_d))
then
402 if (c_associated(this%z2_d))
then
406 if (c_associated(this%z3_d))
then
410 if (c_associated(this%alpha_d))
then
414 if (c_associated(this%tmp_d))
then
418 if (
allocated(this%p1_d))
then
420 if (c_associated(this%p1_d(i)))
then
426 if (
allocated(this%p2_d))
then
428 if (c_associated(this%p2_d(i)))
then
434 if (
allocated(this%p3_d))
then
436 if (c_associated(this%p3_d(i)))
then
444 if (c_associated(this%gs_event1))
then
448 if (c_associated(this%gs_event2))
then
452 if (c_associated(this%gs_event3))
then
460 n, coef, blstx, blsty, blstz, gs_h, niter)
result(ksp_results)
462 class(
ax_t),
intent(inout) :: ax
463 type(
field_t),
intent(inout) :: x
464 type(
field_t),
intent(inout) :: y
465 type(
field_t),
intent(inout) :: z
466 integer,
intent(in) :: n
467 real(kind=
rp),
dimension(n),
intent(inout) :: fx
468 real(kind=
rp),
dimension(n),
intent(inout) :: fy
469 real(kind=
rp),
dimension(n),
intent(inout) :: fz
470 type(
coef_t),
intent(inout) :: coef
474 type(
gs_t),
intent(inout) :: gs_h
476 integer,
optional,
intent(in) :: niter
477 integer :: iter, max_iter, ierr, i, p_cur, p_prev
478 real(kind=
rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
479 real(kind=
rp) :: pap, beta
488 if (
present(niter))
then
493 norm_fac = 1.0_rp / sqrt(coef%volume)
495 associate(w1 => this%w1, w2 => this%w2, w3 => this%w3, r1 => this%r1, &
496 r2 => this%r2, r3 => this%r3, p1 => this%p1, p2 => this%p2, &
497 p3 => this%p3, z1 => this%z1, z2 => this%z2, z3 => this%z3, &
498 tmp_d => this%tmp_d, alpha => this%alpha, alpha_d => this%alpha_d, &
499 w1_d => this%w1_d, w2_d => this%w2_d, w3_d => this%w3_d, &
500 r1_d => this%r1_d, r2_d => this%r2_d, r3_d => this%r3_d, &
501 z1_d => this%z1_d, z2_d => this%z2_d, z3_d => this%z3_d, &
502 p1_d => this%p1_d, p2_d => this%p2_d, p3_d => this%p3_d, &
503 p1_d_d => this%p1_d_d, p2_d_d => this%p2_d_d, p3_d_d => this%p3_d_d)
521 r2_d, r3_d, tmp_d, n)
525 rnorm = sqrt(rtr)*norm_fac
526 ksp_results%res_start = rnorm
527 ksp_results%res_final = rnorm
528 ksp_results(1)%iter = 0
529 ksp_results(2:3)%iter = -1
530 if(
abscmp(rnorm, 0.0_rp))
return
531 call this%monitor_start(
'fcpldCG')
532 do iter = 1, max_iter
533 call this%M%solve(z1, r1, n)
534 call this%M%solve(z2, r2, n)
535 call this%M%solve(z3, r3, n)
539 r1_d, r2_d, r3_d, tmp_d, n)
543 if (iter .eq. 1) beta = 0.0_rp
546 z1_d, z2_d, z3_d, p1_d(p_prev), p2_d(p_prev), p3_d(p_prev), beta, n)
548 call ax%compute_vector(w1, w2, w3, &
549 p1(1, p_cur), p2(1, p_cur), p3(1, p_cur), coef, x%msh, x%Xh)
550 call gs_h%op(w1, n, gs_op_add, this%gs_event1)
551 call gs_h%op(w2, n, gs_op_add, this%gs_event2)
552 call gs_h%op(w3, n, gs_op_add, this%gs_event3)
561 p2_d(p_cur), p3_d(p_cur), tmp_d, n)
565 alpha(p_cur) = rtz1 / pap
567 w1_d, w2_d, w3_d, alpha_d, alpha(p_cur), p_cur, n)
568 rnorm = sqrt(rtr)*norm_fac
569 call this%monitor_iter(iter, rnorm)
571 (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter)
then
573 p1_d_d, p2_d_d, p3_d_d, alpha_d, p_cur, n)
576 if (rnorm .lt. this%abs_tol)
exit
582 call this%monitor_stop()
583 ksp_results%res_final = rnorm
584 ksp_results%iter = iter
592 gs_h, niter)
result(ksp_results)
594 class(
ax_t),
intent(inout) :: ax
595 type(
field_t),
intent(inout) :: x
596 integer,
intent(in) :: n
597 real(kind=
rp),
dimension(n),
intent(inout) :: f
598 type(
coef_t),
intent(inout) :: coef
600 type(
gs_t),
intent(inout) :: gs_h
602 integer,
optional,
intent(in) :: niter
605 call neko_error(
'Only defined for coupled solves')
607 ksp_results%res_final = 0.0
Return the device pointer for an associated Fortran array.
Map a Fortran array to a device (allocate and associate)
Copy data between host and device (or device and device)
Defines a Matrix-vector product.
Defines a boundary condition.
type(mpi_comm) neko_comm
MPI communicator.
type(mpi_datatype) mpi_real_precision
MPI type for working precision of REAL types.
integer pe_size
MPI size of communicator.
subroutine, public device_rzero(a_d, n)
Zero a real vector.
real(kind=rp) function, public device_glsc2(a_d, b_d, n)
Weighted inner product .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n)
Weighted inner product .
subroutine, public device_copy(a_d, b_d, n)
Copy a vector .
Device abstraction, common interface for various accelerators.
subroutine, public device_event_sync(event)
Synchronize an event.
integer, parameter, public host_to_device
subroutine, public device_free(x_d)
Deallocate memory on the device.
subroutine, public device_event_destroy(event)
Destroy a device event.
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
subroutine, public device_event_create(event, flags)
Create a device event queue.
Defines a fused Conjugate Gradient method for accelerators.
subroutine device_fusedcg_cpld_update_x(x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha, p_cur, n)
subroutine fusedcg_cpld_device_init(this, n, max_iter, M, rel_tol, abs_tol, monitor)
Initialise a fused PCG solver.
subroutine fusedcg_cpld_device_free(this)
Deallocate a pipelined PCG solver.
subroutine device_fusedcg_cpld_update_p(p1_d, p2_d, p3_d, z1_d, z2_d, z3_d, po1_d, po2_d, po3_d, beta, n)
type(ksp_monitor_t) function fusedcg_cpld_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
type(ksp_monitor_t) function, dimension(3) fusedcg_cpld_device_solve_coupled(this, Ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Pipelined PCG solve coupled solve.
real(kind=rp) function device_fusedcg_cpld_part2(a1_d, a2_d, a3_d, b_d, c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
integer, parameter device_fusedcg_cpld_p_space
subroutine device_fusedcg_cpld_part1(a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d, n)
Implements the base abstract type for Krylov solvers plus helper types.
integer, parameter, public ksp_max_iter
Maximum number of iters.
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
subroutine, public copy(a, b, n)
Copy a vector .
subroutine, public rzero(a, n)
Zero a real vector.
integer, parameter, public c_rp
integer, parameter, public rp
Global precision used in computations.
Base type for a matrix-vector product providing .
A list of boundary conditions.
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Base abstract type for a canonical Krylov method, solving .
Defines a canonical Krylov preconditioner.