50 use,
intrinsic :: iso_c_binding, only : c_ptr, c_int, &
51 c_null_ptr, c_associated
57 real(kind=
rp),
allocatable :: d(:)
58 real(kind=
rp),
allocatable :: w(:)
59 real(kind=
rp),
allocatable :: r(:)
60 type(c_ptr) :: d_d = c_null_ptr
61 type(c_ptr) :: w_d = c_null_ptr
62 type(c_ptr) :: r_d = c_null_ptr
63 type(c_ptr) :: gs_event = c_null_ptr
64 real(kind=
rp) :: tha, dlt
65 integer :: power_its = 150
66 logical :: recompute_eigs = .true.
67 logical :: zero_initial_guess = .false.
79 bind(c, name =
'hip_cheby_part1')
80 use,
intrinsic :: iso_c_binding
83 type(c_ptr),
value :: d_d, x_d, strm
91 bind(c, name =
'hip_cheby_part2')
92 use,
intrinsic :: iso_c_binding
95 type(c_ptr),
value :: d_d, w_d, x_d, strm
96 real(c_rp) :: tmp1, tmp2
102 subroutine cuda_cheby_device_part1(d_d, x_d, inv_tha, n, strm) &
103 bind(c, name =
'cuda_cheby_part1')
104 use,
intrinsic :: iso_c_binding
107 type(c_ptr),
value :: d_d, x_d, strm
108 real(c_rp) :: inv_tha
110 end subroutine cuda_cheby_device_part1
114 subroutine cuda_cheby_device_part2(d_d, w_d, x_d, tmp1, tmp2, n, strm) &
115 bind(c, name =
'cuda_cheby_part2')
116 use,
intrinsic :: iso_c_binding
119 type(c_ptr),
value :: d_d, w_d, x_d, strm
120 real(c_rp) :: tmp1, tmp2
122 end subroutine cuda_cheby_device_part2
126 subroutine metal_cheby_device_part1(d_d, x_d, inv_tha, n, strm) &
127 bind(c, name =
'metal_cheby_part1')
128 use,
intrinsic :: iso_c_binding
131 type(c_ptr),
value :: d_d, x_d, strm
132 real(c_rp) :: inv_tha
134 end subroutine metal_cheby_device_part1
138 subroutine metal_cheby_device_part2(d_d, w_d, x_d, tmp1, tmp2, n, strm) &
139 bind(c, name =
'metal_cheby_part2')
140 use,
intrinsic :: iso_c_binding
143 type(c_ptr),
value :: d_d, w_d, x_d, strm
144 real(c_rp) :: tmp1, tmp2
146 end subroutine metal_cheby_device_part2
152 type(c_ptr) :: d_d, x_d
153 real(c_rp) :: inv_tha
158 call cuda_cheby_device_part1(d_d, x_d, inv_tha, n,
glb_cmd_queue)
160 call metal_cheby_device_part1(d_d, x_d, inv_tha, n,
glb_cmd_queue)
161#else !Fallback to device_math for missing device kernels
170 type(c_ptr) :: d_d, w_d, x_d
171 real(c_rp) :: tmp1, tmp2
176 call cuda_cheby_device_part2(d_d, w_d, x_d, tmp1, tmp2, n,
glb_cmd_queue)
178 call metal_cheby_device_part2(d_d, w_d, x_d, tmp1, tmp2, n,
glb_cmd_queue)
179#else !Fallback to device_math for missing device kernels
189 integer,
intent(in) :: max_iter
190 class(
pc_t),
optional,
intent(in),
target :: M
191 integer,
intent(in) :: n
192 real(kind=
rp),
optional,
intent(in) :: rel_tol
193 real(kind=
rp),
optional,
intent(in) :: abs_tol
194 logical,
optional,
intent(in) :: monitor
209 if (
present(rel_tol) .and.
present(abs_tol) .and.
present(monitor))
then
210 call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
211 else if (
present(rel_tol) .and.
present(abs_tol))
then
212 call this%ksp_init(max_iter, rel_tol, abs_tol)
213 else if (
present(monitor) .and.
present(abs_tol))
then
214 call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
215 else if (
present(rel_tol) .and.
present(monitor))
then
216 call this%ksp_init(max_iter, rel_tol, monitor = monitor)
217 else if (
present(rel_tol))
then
218 call this%ksp_init(max_iter, rel_tol = rel_tol)
219 else if (
present(abs_tol))
then
220 call this%ksp_init(max_iter, abs_tol = abs_tol)
221 else if (
present(monitor))
then
222 call this%ksp_init(max_iter, monitor = monitor)
224 call this%ksp_init(max_iter)
236 if (
allocated(this%d))
then
237 if (c_associated(this%d_d))
then
243 if (
allocated(this%w))
then
244 if (c_associated(this%w_d))
then
250 if (
allocated(this%r))
then
251 if (c_associated(this%r_d))
then
259 if (c_associated(this%gs_event))
then
267 class(
ax_t),
intent(in) :: Ax
268 type(
field_t),
intent(inout) :: x
269 integer,
intent(in) :: n
270 type(
coef_t),
intent(inout) :: coef
272 type(
gs_t),
intent(inout) :: gs_h
273 real(kind=
rp) :: lam, b, a, rn
274 real(kind=
rp) :: boost = 1.1_rp
275 real(kind=
rp) :: lam_factor = 30.0_rp
276 real(kind=
rp) :: wtw, dtw, dtd
279 associate(w => this%w, w_d => this%w_d, d => this%d, d_d => this%d_d)
283 call random_number(rn)
288 call gs_h%op(d, n, gs_op_add, this%gs_event)
289 call blst%apply(d, n)
292 do i = 1, this%power_its
293 call ax%compute(w, d, coef, x%msh, x%Xh)
294 call gs_h%op(w, n, gs_op_add, this%gs_event)
295 call blst%apply(w, n)
296 if (
associated(this%schwarz))
then
297 call this%schwarz%compute(this%r, w)
300 call this%M%solve(this%r, w, n)
306 call blst%apply(d, n)
309 call ax%compute(w, d, coef, x%msh, x%Xh)
310 call gs_h%op(w, n, gs_op_add, this%gs_event)
311 call blst%apply(w, n)
312 if (
associated(this%schwarz))
then
313 call this%schwarz%compute(this%r, w)
316 call this%M%solve(this%r, w, n)
325 this%tha = (b+a)/2.0_rp
326 this%dlt = (b-a)/2.0_rp
328 this%recompute_eigs = .false.
336 class(ax_t),
intent(in) :: ax
337 type(field_t),
intent(inout) :: x
338 integer,
intent(in) :: n
339 real(kind=rp),
dimension(n),
intent(in) :: f
340 type(coef_t),
intent(inout) :: coef
341 type(bc_list_t),
intent(inout) :: blst
342 type(gs_t),
intent(inout) :: gs_h
343 type(ksp_monitor_t) :: ksp_results
344 integer,
optional,
intent(in) :: niter
345 integer :: iter, max_iter
346 real(kind=rp) :: a, b, rtr, rnorm, norm_fac
349 f_d = device_get_ptr(f)
351 if (this%recompute_eigs)
then
355 if (
present(niter))
then
358 max_iter = this%max_iter
360 norm_fac = 1.0_rp / sqrt(coef%volume)
362 associate( w => this%w, r => this%r, d => this%d, &
363 w_d => this%w_d, r_d => this%r_d, d_d => this%d_d)
365 call device_copy(r_d, f_d, n)
366 call ax%compute(w, x%x, coef, x%msh, x%Xh)
367 call gs_h%op(w, n, gs_op_add, this%gs_event)
368 call blst%apply(w, n)
369 call device_sub2(r_d, w_d, n)
371 rtr = device_glsc3(r_d, coef%mult_d, r_d, n)
372 rnorm = sqrt(rtr) * norm_fac
373 ksp_results%res_start = rnorm
374 ksp_results%res_final = rnorm
378 call this%M%solve(w, r, n)
379 call device_copy(d_d, w_d, n)
380 a = 2.0_rp / this%tha
381 call device_add2s2(x%x_d, d_d, a, n)
384 do iter = 2, max_iter
386 call device_copy(r_d, f_d, n)
387 call ax%compute(w, x%x, coef, x%msh, x%Xh)
388 call gs_h%op(w, n, gs_op_add, this%gs_event)
389 call blst%apply(w, n)
390 call device_sub2(r_d, w_d, n)
392 call this%M%solve(w, r, n)
394 if (iter .eq. 2)
then
395 b = 0.5_rp * (this%dlt * a)**2
397 b = (this%dlt * a / 2.0_rp)**2
399 a = 1.0_rp/(this%tha - b/a)
400 call device_add2s1(d_d, w_d, b, n)
402 call device_add2s2(x%x_d, d_d, a, n)
406 call device_copy(r_d, f_d, n)
407 call ax%compute(w, x%x, coef, x%msh, x%Xh)
408 call gs_h%op(w, n, gs_op_add, this%gs_event)
409 call blst%apply(w, n)
410 call device_sub2(r_d, w_d, n)
411 rtr = device_glsc3(r_d, coef%mult_d, r_d, n)
412 rnorm = sqrt(rtr) * norm_fac
415 ksp_results%res_final = rnorm
416 ksp_results%iter = iter
417 ksp_results%converged = this%is_converged(iter, rnorm)
425 class(ax_t),
intent(in) :: ax
426 type(field_t),
intent(inout) :: x
427 integer,
intent(in) :: n
428 real(kind=rp),
dimension(n),
intent(in) :: f
429 type(coef_t),
intent(inout) :: coef
430 type(bc_list_t),
intent(inout) :: blst
431 type(gs_t),
intent(inout) :: gs_h
432 type(ksp_monitor_t) :: ksp_results
433 integer,
optional,
intent(in) :: niter
434 integer :: iter, max_iter
435 real(kind=rp) :: a, b, rtr, rnorm, norm_fac
436 real(kind=rp) :: rhok, rhokp1, sig1, tmp1, tmp2
439 f_d = device_get_ptr(f)
441 if (this%recompute_eigs)
then
445 if (
present(niter))
then
448 max_iter = this%max_iter
450 norm_fac = 1.0_rp / sqrt(coef%volume)
452 associate( w => this%w, r => this%r, d => this%d, &
453 w_d => this%w_d, r_d => this%r_d, d_d => this%d_d)
455 if (.not.this%zero_initial_guess)
then
456 call ax%compute(w, x%x, coef, x%msh, x%Xh)
457 call gs_h%op(w, n, gs_op_add, this%gs_event)
458 call blst%apply(w, n)
459 call device_sub3(r_d, f_d, w_d, n)
461 call device_copy(r_d, f_d, n)
462 this%zero_initial_guess = .false.
466 if (
associated(this%schwarz))
then
467 call this%schwarz%compute(d, r)
469 call this%M%solve(d, r, n)
472 tmp1 = 1.0_rp / this%tha
475 sig1 = this%tha / this%dlt
479 do iter = 2, max_iter
480 rhokp1 = 1.0_rp / (2.0_rp * sig1 - rhok)
482 tmp2 = 2.0_rp * rhokp1 / this%dlt
485 call ax%compute(w, x%x, coef, x%msh, x%Xh)
486 call gs_h%op(w, n, gs_op_add, this%gs_event)
487 call blst%apply(w, n)
488 call device_sub3(r_d, f_d, w_d, n)
490 if (
associated(this%schwarz))
then
491 call this%schwarz%compute(w, r)
493 call this%M%solve(w, r, n)
505 n, coef, blstx, blsty, blstz, gs_h, niter)
result(ksp_results)
507 class(ax_t),
intent(in) :: ax
508 type(field_t),
intent(inout) :: x
509 type(field_t),
intent(inout) :: y
510 type(field_t),
intent(inout) :: z
511 integer,
intent(in) :: n
512 real(kind=rp),
dimension(n),
intent(in) :: fx
513 real(kind=rp),
dimension(n),
intent(in) :: fy
514 real(kind=rp),
dimension(n),
intent(in) :: fz
515 type(coef_t),
intent(inout) :: coef
516 type(bc_list_t),
intent(inout) :: blstx
517 type(bc_list_t),
intent(inout) :: blsty
518 type(bc_list_t),
intent(inout) :: blstz
519 type(gs_t),
intent(inout) :: gs_h
520 type(ksp_monitor_t),
dimension(3) :: ksp_results
521 integer,
optional,
intent(in) :: niter
523 ksp_results(1) = this%solve(ax, x, fx, n, coef, blstx, gs_h, niter)
524 ksp_results(2) = this%solve(ax, y, fy, n, coef, blsty, gs_h, niter)
525 ksp_results(3) = this%solve(ax, z, fz, n, coef, blstz, gs_h, niter)
__device__ T solve(const T u, const T y, const T guess, const T nu, const T kappa, const T B)
Map a Fortran array to a device (allocate and associate)
Copy data between host and device (or device and device)
Unmap a Fortran array from a device (deassociate and free)
Defines a Matrix-vector product.
Chebyshev preconditioner.
subroutine cheby_device_init(this, n, max_iter, m, rel_tol, abs_tol, monitor)
Initialise a standard solver.
subroutine cheby_device_part2(d_d, w_d, x_d, tmp1, tmp2, n)
type(ksp_monitor_t) function, dimension(3) cheby_device_solve_coupled(this, ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Standard Cheby_Deviceshev coupled solve.
type(ksp_monitor_t) function cheby_device_solve(this, ax, x, f, n, coef, blst, gs_h, niter)
A chebyshev preconditioner.
subroutine cheby_device_free(this)
subroutine cheby_device_part1(d_d, x_d, inv_tha, n)
subroutine cheby_device_power(this, ax, x, n, coef, blst, gs_h)
type(ksp_monitor_t) function cheby_device_impl(this, ax, x, f, n, coef, blst, gs_h, niter)
A chebyshev preconditioner.
subroutine, public device_add2s1(a_d, b_d, c1, n, strm)
subroutine, public device_sub3(a_d, b_d, c_d, n, strm)
Vector subtraction .
subroutine, public device_add2s2(a_d, b_d, c1, n, strm)
Vector addition with scalar multiplication (multiplication on first argument)
subroutine, public device_add2(a_d, b_d, n, strm)
Vector addition .
subroutine, public device_cmult(a_d, c, n, strm)
Multiplication by constant c .
subroutine, public device_sub2(a_d, b_d, n, strm)
Vector substraction .
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
subroutine, public device_cmult2(a_d, b_d, c, n, strm)
Multiplication by constant c .
Device abstraction, common interface for various accelerators.
integer, parameter, public host_to_device
subroutine, public device_event_destroy(event)
Destroy a device event.
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
subroutine, public device_event_create(event, flags)
Create a device event queue.
Implements the base abstract type for Krylov solvers plus helper types.
integer, parameter, public c_rp
integer, parameter, public rp
Global precision used in computations.
Overlapping schwarz solves.
Defines a function space.
Base type for a matrix-vector product providing .
A list of allocatable `bc_t`. Follows the standard interface of lists.
Defines a Chebyshev preconditioner.
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Type for storing initial and final residuals in a Krylov solver.
Base abstract type for a canonical Krylov method, solving .
Defines a canonical Krylov preconditioner.
The function space for the SEM solution fields.