54 real(kind=
rp),
allocatable :: w(:)
55 real(kind=
rp),
allocatable :: r(:)
56 real(kind=
rp),
allocatable :: z(:)
57 real(kind=
rp),
allocatable :: p(:,:)
58 real(kind=
rp),
allocatable :: alpha(:)
59 type(c_ptr) :: w_d = c_null_ptr
60 type(c_ptr) :: r_d = c_null_ptr
61 type(c_ptr) :: z_d = c_null_ptr
62 type(c_ptr) :: alpha_d = c_null_ptr
63 type(c_ptr) :: p_d_d = c_null_ptr
64 type(c_ptr),
allocatable :: p_d(:)
65 type(c_ptr) :: gs_event = c_null_ptr
75 bind(c, name=
'cuda_fusedcg_update_p')
76 use,
intrinsic :: iso_c_binding
79 type(c_ptr),
value :: p_d, z_d, po_d
87 bind(c, name=
'cuda_fusedcg_update_x')
88 use,
intrinsic :: iso_c_binding
90 type(c_ptr),
value :: x_d, p_d, alpha
91 integer(c_int) :: p_cur, n
97 p_cur, n) bind(c, name=
'cuda_fusedcg_part2')
98 use,
intrinsic :: iso_c_binding
101 type(c_ptr),
value :: a_d, b_d, c_d, alpha_d
103 integer(c_int) :: n, p_cur
109 bind(c, name=
'hip_fusedcg_update_p')
110 use,
intrinsic :: iso_c_binding
113 type(c_ptr),
value :: p_d, z_d, po_d
121 bind(c, name=
'hip_fusedcg_update_x')
122 use,
intrinsic :: iso_c_binding
124 type(c_ptr),
value :: x_d, p_d, alpha
125 integer(c_int) :: p_cur, n
131 p_cur, n) bind(c, name=
'hip_fusedcg_part2')
132 use,
intrinsic :: iso_c_binding
135 type(c_ptr),
value :: a_d, b_d, c_d, alpha_d
137 integer(c_int) :: n, p_cur
145 type(c_ptr),
value :: p_d, z_d, po_d
153 call neko_error(
'No device backend configured')
158 type(c_ptr),
value :: x_d, p_d, alpha
159 integer(c_int) :: p_cur, n
165 call neko_error(
'No device backend configured')
170 p_cur, n)
result(res)
171 type(c_ptr),
value :: a_d, b_d, c_d, alpha_d
181 call neko_error(
'No device backend configured')
184 #ifndef HAVE_DEVICE_MPI
185 if (pe_size .gt. 1)
then
186 call mpi_allreduce(mpi_in_place, res, 1, &
187 mpi_real_precision, mpi_sum, neko_comm, ierr)
196 class(pc_t),
optional,
intent(inout),
target :: M
197 integer,
intent(in) :: n
198 integer,
intent(in) :: max_iter
199 real(kind=rp),
optional,
intent(inout) :: rel_tol
200 real(kind=rp),
optional,
intent(inout) :: abs_tol
202 integer(c_size_t) :: p_size
218 call device_map(this%w, this%w_d, n)
219 call device_map(this%r, this%r_d, n)
220 call device_map(this%z, this%z_d, n)
223 this%p_d(i) = c_null_ptr
224 call device_map(this%p(:,i), this%p_d(i), n)
228 call device_alloc(this%p_d_d, p_size)
229 ptr = c_loc(this%p_d)
230 call device_memcpy(ptr, this%p_d_d, p_size, &
231 host_to_device, sync=.false.)
232 if (
present(rel_tol) .and.
present(abs_tol))
then
233 call this%ksp_init(max_iter, rel_tol, abs_tol)
234 else if (
present(rel_tol))
then
235 call this%ksp_init(max_iter, rel_tol=rel_tol)
236 else if (
present(abs_tol))
then
237 call this%ksp_init(max_iter, abs_tol=abs_tol)
239 call this%ksp_init(max_iter)
242 call device_event_create(this%gs_event, 2)
253 if (
allocated(this%w))
then
257 if (
allocated(this%r))
then
262 if (
allocated(this%z))
then
267 if (
allocated(this%alpha))
then
268 deallocate(this%alpha)
271 if (
allocated(this%p))
then
275 if (c_associated(this%w_d))
then
276 call device_free(this%w_d)
279 if (c_associated(this%r_d))
then
280 call device_free(this%r_d)
283 if (c_associated(this%z_d))
then
284 call device_free(this%z_d)
287 if (c_associated(this%alpha_d))
then
288 call device_free(this%alpha_d)
291 if (
allocated(this%p_d))
then
293 if (c_associated(this%p_d(i)))
then
294 call device_free(this%p_d(i))
301 if (c_associated(this%gs_event))
then
302 call device_event_destroy(this%gs_event)
310 class(ax_t),
intent(inout) :: ax
311 type(field_t),
intent(inout) :: x
312 integer,
intent(in) :: n
313 real(kind=rp),
dimension(n),
intent(inout) :: f
314 type(coef_t),
intent(inout) :: coef
315 type(bc_list_t),
intent(inout) :: blst
316 type(gs_t),
intent(inout) :: gs_h
317 type(ksp_monitor_t) :: ksp_results
318 integer,
optional,
intent(in) :: niter
319 integer :: iter, max_iter, ierr, i, p_cur, p_prev
320 real(kind=rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
321 real(kind=rp) :: pap, beta
323 f_d = device_get_ptr(f)
325 if (
present(niter))
then
328 max_iter = ksp_max_iter
330 norm_fac = 1.0_rp / sqrt(coef%volume)
332 associate(w => this%w, r => this%r, p => this%p, z => this%z, &
333 alpha => this%alpha, alpha_d => this%alpha_d, &
334 w_d => this%w_d, r_d => this%r_d, z_d => this%z_d, &
335 p_d => this%p_d, p_d_d => this%p_d_d)
340 call device_rzero(x%x_d, n)
341 call device_rzero(p_d(1), n)
342 call device_copy(r_d, f_d, n)
344 rtr = device_glsc3(r_d, coef%mult_d, r_d, n)
345 rnorm = sqrt(rtr)*norm_fac
346 ksp_results%res_start = rnorm
347 ksp_results%res_final = rnorm
349 if(abscmp(rnorm, 0.0_rp))
return
351 do iter = 1, max_iter
352 call this%M%solve(z, r, n)
354 rtz1 = device_glsc3(r_d, coef%mult_d, z_d, n)
356 if (iter .eq. 1) beta = 0.0_rp
359 call ax%compute(w, p(1, p_cur), coef, x%msh, x%Xh)
360 call gs_h%op(w, n, gs_op_add, this%gs_event)
361 call device_event_sync(this%gs_event)
362 call bc_list_apply(blst, w, n)
364 pap = device_glsc3(w_d, coef%mult_d, this%p_d(p_cur), n)
366 alpha(p_cur) = rtz1 / pap
368 alpha_d, alpha(p_cur), p_cur, n)
369 rnorm = sqrt(rtr)*norm_fac
372 (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter)
then
376 if (rnorm .lt. this%abs_tol)
exit
383 ksp_results%res_final = rnorm
384 ksp_results%iter = iter
void hip_fusedcg_update_x(void *x, void *p, void *alpha, int *p_cur, int *n)
real hip_fusedcg_part2(void *a, void *b, void *c, void *alpha_d, real *alpha, int *p_cur, int *n)
void hip_fusedcg_update_p(void *p, void *z, void *po, real *beta, int *n)
Defines a Matrix-vector product.
Defines a boundary condition.
subroutine, public device_rzero(a_d, n)
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n)
subroutine, public device_copy(a_d, b_d, n)
Device abstraction, common interface for various accelerators.
Defines a fused Conjugate Gradient method for accelerators.
subroutine fusedcg_device_free(this)
Deallocate a pipelined PCG solver.
type(ksp_monitor_t) function fusedcg_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
real(kind=rp) function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
subroutine fusedcg_device_init(this, n, max_iter, M, rel_tol, abs_tol)
Initialise a fused PCG solver.
integer, parameter device_fusedcg_p_space
subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
Implements the base abstract type for Krylov solvers plus helper types.
integer, parameter, public ksp_max_iter
Maximum number of iters.
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
subroutine, public copy(a, b, n)
Copy a vector .
subroutine, public rzero(a, n)
Zero a real vector.
integer, parameter, public c_rp
integer, parameter, public rp
Global precision used in computations.
Base type for a matrix-vector product providing .
A list of boundary conditions.
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Base abstract type for a canonical Krylov method, solving .
Defines a canonical Krylov preconditioner.