51 use,
intrinsic :: iso_c_binding
58 real(kind=
rp),
allocatable :: w(:)
59 real(kind=
rp),
allocatable :: c(:)
60 real(kind=
rp),
allocatable :: r(:)
61 real(kind=
rp),
allocatable :: z(:,:)
62 real(kind=
rp),
allocatable :: h(:,:)
63 real(kind=
rp),
allocatable :: v(:,:)
64 real(kind=
rp),
allocatable :: s(:)
65 real(kind=
rp),
allocatable :: gam(:)
66 type(c_ptr) :: w_d = c_null_ptr
67 type(c_ptr) :: c_d = c_null_ptr
68 type(c_ptr) :: r_d = c_null_ptr
69 type(c_ptr) :: s_d = c_null_ptr
70 type(c_ptr) :: gam_d = c_null_ptr
71 type(c_ptr),
allocatable :: z_d(:), h_d(:), v_d(:)
72 type(c_ptr) :: z_d_d = c_null_ptr
73 type(c_ptr) :: h_d_d = c_null_ptr
74 type(c_ptr) :: v_d_d = c_null_ptr
75 type(c_ptr) :: gs_event = c_null_ptr
85 bind(c, name=
'hip_gmres_part2')
86 use,
intrinsic :: iso_c_binding
89 type(c_ptr),
value :: h_d, w_d, v_d_d, mult_d
90 integer(c_int) :: j, n
97 bind(c, name=
'cuda_gmres_part2')
98 use,
intrinsic :: iso_c_binding
101 type(c_ptr),
value :: h_d, w_d, v_d_d, mult_d
102 integer(c_int) :: j, n
110 type(c_ptr),
value :: h_d, w_d, v_d_d, mult_d
111 integer(c_int) :: j, n
119 call neko_error(
'No device backend configured')
122 #ifndef HAVE_DEVICE_MPI
123 if (pe_size .gt. 1)
then
124 call mpi_allreduce(mpi_in_place, alpha, 1, &
125 mpi_real_precision, mpi_sum, neko_comm, ierr)
134 integer,
intent(in) :: n
135 integer,
intent(in) :: max_iter
136 class(pc_t),
optional,
intent(inout),
target :: M
137 integer,
optional,
intent(inout) :: m_restart
138 real(kind=rp),
optional,
intent(inout) :: rel_tol
139 real(kind=rp),
optional,
intent(inout) :: abs_tol
140 type(device_ident_t),
target :: m_ident
142 integer(c_size_t) :: z_size
145 if (
present(m_restart))
then
146 this%m_restart = m_restart
162 call device_map(this%w, this%w_d, n)
163 call device_map(this%r, this%r_d, n)
165 allocate(this%c(this%m_restart))
166 allocate(this%s(this%m_restart))
167 allocate(this%gam(this%m_restart + 1))
168 call device_map(this%c, this%c_d, this%m_restart)
169 call device_map(this%s, this%s_d, this%m_restart)
170 call device_map(this%gam, this%gam_d, this%m_restart+1)
172 allocate(this%z(n,this%m_restart))
173 allocate(this%v(n,this%m_restart))
174 allocate(this%h(this%m_restart,this%m_restart))
175 allocate(this%z_d(this%m_restart))
176 allocate(this%v_d(this%m_restart))
177 allocate(this%h_d(this%m_restart))
178 do i = 1, this%m_restart
179 this%z_d(i) = c_null_ptr
180 call device_map(this%z(:,i), this%z_d(i), n)
182 this%v_d(i) = c_null_ptr
183 call device_map(this%v(:,i), this%v_d(i), n)
185 this%h_d(i) = c_null_ptr
186 call device_map(this%h(:,i), this%h_d(i), this%m_restart)
189 z_size = c_sizeof(c_null_ptr) * (this%m_restart)
190 call device_alloc(this%z_d_d, z_size)
191 call device_alloc(this%v_d_d, z_size)
192 call device_alloc(this%h_d_d, z_size)
193 ptr = c_loc(this%z_d)
194 call device_memcpy(ptr,this%z_d_d, z_size, &
195 host_to_device, sync=.false.)
196 ptr = c_loc(this%v_d)
197 call device_memcpy(ptr,this%v_d_d, z_size, &
198 host_to_device, sync=.false.)
199 ptr = c_loc(this%h_d)
200 call device_memcpy(ptr,this%h_d_d, z_size, &
201 host_to_device, sync=.false.)
204 if (
present(rel_tol) .and.
present(abs_tol))
then
205 call this%ksp_init(max_iter, rel_tol, abs_tol)
206 else if (
present(rel_tol))
then
207 call this%ksp_init(max_iter, rel_tol=rel_tol)
208 else if (
present(abs_tol))
then
209 call this%ksp_init(max_iter, abs_tol=abs_tol)
211 call this%ksp_init(max_iter)
214 call device_event_create(this%gs_event, 2)
225 if (
allocated(this%w))
then
229 if (
allocated(this%c))
then
233 if (
allocated(this%r))
then
237 if (
allocated(this%z))
then
241 if (
allocated(this%h))
then
245 if (
allocated(this%v))
then
249 if (
allocated(this%s))
then
252 if (
allocated(this%gam))
then
256 if (
allocated(this%v_d))
then
257 do i = 1, this%m_restart
258 if (c_associated(this%v_d(i)))
then
259 call device_free(this%v_d(i))
264 if (
allocated(this%z_d))
then
265 do i = 1, this%m_restart
266 if (c_associated(this%z_d(i)))
then
267 call device_free(this%z_d(i))
271 if (
allocated(this%h_d))
then
272 do i = 1, this%m_restart
273 if (c_associated(this%h_d(i)))
then
274 call device_free(this%h_d(i))
281 if (c_associated(this%gam_d))
then
282 call device_free(this%gam_d)
284 if (c_associated(this%w_d))
then
285 call device_free(this%w_d)
287 if (c_associated(this%c_d))
then
288 call device_free(this%c_d)
290 if (c_associated(this%r_d))
then
291 call device_free(this%r_d)
293 if (c_associated(this%s_d))
then
294 call device_free(this%s_d)
299 if (c_associated(this%gs_event))
then
300 call device_event_destroy(this%gs_event)
308 class(ax_t),
intent(inout) :: ax
309 type(field_t),
intent(inout) :: x
310 integer,
intent(in) :: n
311 real(kind=rp),
dimension(n),
intent(inout) :: f
312 type(coef_t),
intent(inout) :: coef
313 type(bc_list_t),
intent(inout) :: blst
314 type(gs_t),
intent(inout) :: gs_h
315 type(ksp_monitor_t) :: ksp_results
316 integer,
optional,
intent(in) :: niter
317 integer :: iter, max_iter
319 real(kind=rp) :: rnorm, alpha, temp, lr, alpha2, norm_fac
323 f_d = device_get_ptr(f)
328 if (
present(niter))
then
331 max_iter = this%max_iter
334 associate(w => this%w, c => this%c, r => this%r, z => this%z, h => this%h, &
335 v => this%v, s => this%s, gam => this%gam, v_d => this%v_d, &
336 w_d => this%w_d, r_d => this%r_d, h_d => this%h_d, v_d_d => this%v_d_d, &
337 x_d => x%x_d, z_d_d => this%z_d_d, c_d => this%c_d)
339 norm_fac = 1.0_rp / sqrt(coef%volume)
340 call rzero(gam, this%m_restart + 1)
341 call rone(s, this%m_restart)
342 call rone(c, this%m_restart)
343 call rzero(h, this%m_restart * this%m_restart)
344 call device_rzero(x%x_d, n)
345 call device_rzero(this%gam_d, this%m_restart + 1)
346 call device_rone(this%s_d, this%m_restart)
347 call device_rone(this%c_d, this%m_restart)
349 call rzero(this%h, this%m_restart**2)
353 do while (.not. conv .and. iter .lt. max_iter)
356 call device_copy(r_d, f_d, n)
358 call device_copy(r_d, f_d, n)
359 call ax%compute(w, x%x, coef, x%msh, x%Xh)
360 call gs_h%op(w, n, gs_op_add, this%gs_event)
361 call device_event_sync(this%gs_event)
362 call bc_list_apply(blst, w, n)
363 call device_sub2(r_d, w_d, n)
366 gam(1) = sqrt(device_glsc3(r_d, r_d, coef%mult_d, n))
368 ksp_results%res_start = gam(1) * norm_fac
371 if (abscmp(gam(1), 0.0_rp))
return
374 temp = 1.0_rp / gam(1)
375 call device_cmult2(v_d(1), r_d, temp, n)
376 do j = 1, this%m_restart
379 call this%M%solve(z(1,j), v(1,j), n)
381 call ax%compute(w, z(1,j), coef, x%msh, x%Xh)
382 call gs_h%op(w, n, gs_op_add, this%gs_event)
383 call device_event_sync(this%gs_event)
384 call bc_list_apply(blst, w, n)
386 if (neko_bcknd_opencl .eq. 1)
then
388 h(i,j) = device_glsc3(w_d, v_d(i), coef%mult_d, n)
390 call device_add2s2(w_d, v_d(i), -h(i,j), n)
392 alpha2 = device_glsc3(w_d, w_d, coef%mult_d, n)
395 call device_glsc3_many(h(1,j), w_d, v_d_d, coef%mult_d, j, n)
397 call device_memcpy(h(:,j), h_d(j), j, &
398 host_to_device, sync=.false.)
407 h(i ,j) = c(i)*temp + s(i) * h(i+1,j)
408 h(i+1,j) = -s(i)*temp + c(i) * h(i+1,j)
412 if(abscmp(alpha, 0.0_rp))
then
417 lr = sqrt(h(j,j) * h(j,j) + alpha**2)
422 call device_memcpy(h(:,j), h_d(j), j, &
423 host_to_device, sync=.false.)
424 gam(j+1) = -s(j) * gam(j)
425 gam(j) = c(j) * gam(j)
427 rnorm = abs(gam(j+1)) * norm_fac
428 if (rnorm .lt. this%abs_tol)
then
433 if (iter + 1 .gt. max_iter)
exit
435 if( j .lt. this%m_restart)
then
436 temp = 1.0_rp / alpha
437 call device_cmult2(v_d(j+1), w_d, temp, n)
442 j = min(j, this%m_restart)
446 temp = temp - h(k,i) * c(i)
451 if (neko_bcknd_opencl .eq. 1)
then
453 call device_add2s2(x_d, this%z_d(i), c(i), n)
456 call device_memcpy(c, c_d, j, host_to_device, sync=.false.)
457 call device_add2s2_many(x_d, z_d_d, c_d, j, n)
463 ksp_results%res_final = rnorm
464 ksp_results%iter = iter
real cuda_gmres_part2(void *w, void *v, void *h, void *mult, int *j, int *n)
Defines a Matrix-vector product.
Defines a boundary condition.
Identity Krylov preconditioner for accelerators.
subroutine, public device_add2s1(a_d, b_d, c1, n)
subroutine, public device_rzero(a_d, n)
subroutine, public device_rone(a_d, n)
subroutine, public device_add2s2(a_d, b_d, c1, n)
subroutine, public device_cmult2(a_d, b_d, c, n)
subroutine, public device_add2s2_many(y_d, x_d_d, a_d, j, n)
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n)
subroutine, public device_copy(a_d, b_d, n)
subroutine, public device_glsc3_many(h, w_d, v_d_d, mult_d, j, n)
subroutine, public device_sub2(a_d, b_d, n)
Device abstraction, common interface for various accelerators.
Defines various GMRES methods.
real(c_rp) function device_gmres_part2(w_d, v_d_d, h_d, mult_d, j, n)
type(ksp_monitor_t) function gmres_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter)
Standard GMRES solve.
subroutine gmres_device_free(this)
Deallocate a standard GMRES solver.
subroutine gmres_device_init(this, n, max_iter, M, m_restart, rel_tol, abs_tol)
Initialise a standard GMRES solver.
Implements the base abstract type for Krylov solvers plus helper types.
subroutine, public rone(a, n)
Set all elements to one.
subroutine, public rzero(a, n)
Zero a real vector.
integer, parameter, public c_rp
integer, parameter, public rp
Global precision used in computations.
Base type for a matrix-vector product, providing the operation \(Ax\).
A list of boundary conditions.
Coefficients defined on a given (mesh, function space \(X_h\)) tuple. Arrays use indices (i,j,k,e): element e,...
Defines a canonical Krylov preconditioner for accelerators.
Standard preconditioned generalized minimal residual method.
Type for storing initial and final residuals in a Krylov solver.
Base abstract type for a canonical Krylov method, solving \(Ax = f\).
Defines a canonical Krylov preconditioner.