Neko  0.8.1
A portable framework for high-order spectral element flow simulations
gmres_device.F90
Go to the documentation of this file.
1 ! Copyright (c) 2022-2024, The Neko Authors
2 ! All rights reserved.
3 !
4 ! Redistribution and use in source and binary forms, with or without
5 ! modification, are permitted provided that the following conditions
6 ! are met:
7 !
8 ! * Redistributions of source code must retain the above copyright
9 ! notice, this list of conditions and the following disclaimer.
10 !
11 ! * Redistributions in binary form must reproduce the above
12 ! copyright notice, this list of conditions and the following
13 ! disclaimer in the documentation and/or other materials provided
14 ! with the distribution.
15 !
16 ! * Neither the name of the authors nor the names of its
17 ! contributors may be used to endorse or promote products derived
18 ! from this software without specific prior written permission.
19 !
20 ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 ! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 ! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 ! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 ! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 ! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 ! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 ! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 ! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 ! POSSIBILITY OF SUCH DAMAGE.
32 !
35  use krylov, only : ksp_t, ksp_monitor_t
36  use precon, only : pc_t
37  use ax_product, only : ax_t
38  use num_types, only: rp, c_rp
39  use field, only : field_t
40  use coefs, only : coef_t
41  use gather_scatter, only : gs_t, gs_op_add
42  use bc, only : bc_list_t, bc_list_apply
44  use math, only : rone, rzero, abscmp
49  use device
50  use comm
51  use, intrinsic :: iso_c_binding
52  implicit none
53  private
54 
56  type, public, extends(ksp_t) :: gmres_device_t
57  integer :: m_restart
58  real(kind=rp), allocatable :: w(:)
59  real(kind=rp), allocatable :: c(:)
60  real(kind=rp), allocatable :: r(:)
61  real(kind=rp), allocatable :: z(:,:)
62  real(kind=rp), allocatable :: h(:,:)
63  real(kind=rp), allocatable :: v(:,:)
64  real(kind=rp), allocatable :: s(:)
65  real(kind=rp), allocatable :: gam(:)
66  type(c_ptr) :: w_d = c_null_ptr
67  type(c_ptr) :: c_d = c_null_ptr
68  type(c_ptr) :: r_d = c_null_ptr
69  type(c_ptr) :: s_d = c_null_ptr
70  type(c_ptr) :: gam_d = c_null_ptr
71  type(c_ptr), allocatable :: z_d(:), h_d(:), v_d(:)
72  type(c_ptr) :: z_d_d = c_null_ptr
73  type(c_ptr) :: h_d_d = c_null_ptr
74  type(c_ptr) :: v_d_d = c_null_ptr
75  type(c_ptr) :: gs_event = c_null_ptr
76  contains
77  procedure, pass(this) :: init => gmres_device_init
78  procedure, pass(this) :: free => gmres_device_free
79  procedure, pass(this) :: solve => gmres_device_solve
80  end type gmres_device_t
81 
82 #ifdef HAVE_HIP
83  interface
84  real(c_rp) function hip_gmres_part2(w_d,v_d_d,h_d,mult_d,j,n) &
85  bind(c, name='hip_gmres_part2')
86  use, intrinsic :: iso_c_binding
87  import c_rp
88  implicit none
89  type(c_ptr), value :: h_d, w_d, v_d_d, mult_d
90  integer(c_int) :: j, n
91  end function hip_gmres_part2
92  end interface
93 #elif HAVE_CUDA
94 
95  interface
96  real(c_rp) function cuda_gmres_part2(w_d,v_d_d,h_d,mult_d,j,n) &
97  bind(c, name='cuda_gmres_part2')
98  use, intrinsic :: iso_c_binding
99  import c_rp
100  implicit none
101  type(c_ptr), value :: h_d, w_d, v_d_d, mult_d
102  integer(c_int) :: j, n
103  end function cuda_gmres_part2
104  end interface
105 #endif
106 
107 contains
108 
109  function device_gmres_part2(w_d,v_d_d,h_d,mult_d,j,n) result(alpha)
110  type(c_ptr), value :: h_d, w_d, v_d_d, mult_d
111  integer(c_int) :: j, n
112  real(c_rp) :: alpha
113  integer :: ierr
114 #ifdef HAVE_HIP
115  alpha = hip_gmres_part2(w_d,v_d_d,h_d,mult_d,j,n)
116 #elif HAVE_CUDA
117  alpha = cuda_gmres_part2(w_d,v_d_d,h_d,mult_d,j,n)
118 #else
119  call neko_error('No device backend configured')
120 #endif
121 
122 #ifndef HAVE_DEVICE_MPI
123  if (pe_size .gt. 1) then
124  call mpi_allreduce(mpi_in_place, alpha, 1, &
125  mpi_real_precision, mpi_sum, neko_comm, ierr)
126  end if
127 #endif
128 
129  end function device_gmres_part2
130 
132  subroutine gmres_device_init(this, n, max_iter, M, m_restart, rel_tol, abs_tol)
133  class(gmres_device_t), target, intent(inout) :: this
134  integer, intent(in) :: n
135  integer, intent(in) :: max_iter
136  class(pc_t), optional, intent(inout), target :: M
137  integer, optional, intent(inout) :: m_restart
138  real(kind=rp), optional, intent(inout) :: rel_tol
139  real(kind=rp), optional, intent(inout) :: abs_tol
140  type(device_ident_t), target :: m_ident
141  type(c_ptr) :: ptr
142  integer(c_size_t) :: z_size
143  integer :: i
144 
145  if (present(m_restart)) then
146  this%m_restart = m_restart
147  else
148  this%m_restart = 30
149  end if
150 
151 
152  call this%free()
153 
154  if (present(m)) then
155  this%M => m
156  else
157  this%M => m_ident
158  end if
159 
160  allocate(this%w(n))
161  allocate(this%r(n))
162  call device_map(this%w, this%w_d, n)
163  call device_map(this%r, this%r_d, n)
164 
165  allocate(this%c(this%m_restart))
166  allocate(this%s(this%m_restart))
167  allocate(this%gam(this%m_restart + 1))
168  call device_map(this%c, this%c_d, this%m_restart)
169  call device_map(this%s, this%s_d, this%m_restart)
170  call device_map(this%gam, this%gam_d, this%m_restart+1)
171 
172  allocate(this%z(n,this%m_restart))
173  allocate(this%v(n,this%m_restart))
174  allocate(this%h(this%m_restart,this%m_restart))
175  allocate(this%z_d(this%m_restart))
176  allocate(this%v_d(this%m_restart))
177  allocate(this%h_d(this%m_restart))
178  do i = 1, this%m_restart
179  this%z_d(i) = c_null_ptr
180  call device_map(this%z(:,i), this%z_d(i), n)
181 
182  this%v_d(i) = c_null_ptr
183  call device_map(this%v(:,i), this%v_d(i), n)
184 
185  this%h_d(i) = c_null_ptr
186  call device_map(this%h(:,i), this%h_d(i), this%m_restart)
187  end do
188 
189  z_size = c_sizeof(c_null_ptr) * (this%m_restart)
190  call device_alloc(this%z_d_d, z_size)
191  call device_alloc(this%v_d_d, z_size)
192  call device_alloc(this%h_d_d, z_size)
193  ptr = c_loc(this%z_d)
194  call device_memcpy(ptr,this%z_d_d, z_size, &
195  host_to_device, sync=.false.)
196  ptr = c_loc(this%v_d)
197  call device_memcpy(ptr,this%v_d_d, z_size, &
198  host_to_device, sync=.false.)
199  ptr = c_loc(this%h_d)
200  call device_memcpy(ptr,this%h_d_d, z_size, &
201  host_to_device, sync=.false.)
202 
203 
204  if (present(rel_tol) .and. present(abs_tol)) then
205  call this%ksp_init(max_iter, rel_tol, abs_tol)
206  else if (present(rel_tol)) then
207  call this%ksp_init(max_iter, rel_tol=rel_tol)
208  else if (present(abs_tol)) then
209  call this%ksp_init(max_iter, abs_tol=abs_tol)
210  else
211  call this%ksp_init(max_iter)
212  end if
213 
214  call device_event_create(this%gs_event, 2)
215 
216  end subroutine gmres_device_init
217 
219  subroutine gmres_device_free(this)
220  class(gmres_device_t), intent(inout) :: this
221  integer :: i
222 
223  call this%ksp_free()
224 
225  if (allocated(this%w)) then
226  deallocate(this%w)
227  end if
228 
229  if (allocated(this%c)) then
230  deallocate(this%c)
231  end if
232 
233  if (allocated(this%r)) then
234  deallocate(this%r)
235  end if
236 
237  if (allocated(this%z)) then
238  deallocate(this%z)
239  end if
240 
241  if (allocated(this%h)) then
242  deallocate(this%h)
243  end if
244 
245  if (allocated(this%v)) then
246  deallocate(this%v)
247  end if
248 
249  if (allocated(this%s)) then
250  deallocate(this%s)
251  end if
252  if (allocated(this%gam)) then
253  deallocate(this%gam)
254  end if
255 
256  if (allocated(this%v_d)) then
257  do i = 1, this%m_restart
258  if (c_associated(this%v_d(i))) then
259  call device_free(this%v_d(i))
260  end if
261  end do
262  end if
263 
264  if (allocated(this%z_d)) then
265  do i = 1, this%m_restart
266  if (c_associated(this%z_d(i))) then
267  call device_free(this%z_d(i))
268  end if
269  end do
270  end if
271  if (allocated(this%h_d)) then
272  do i = 1, this%m_restart
273  if (c_associated(this%h_d(i))) then
274  call device_free(this%h_d(i))
275  end if
276  end do
277  end if
278 
279 
280 
281  if (c_associated(this%gam_d)) then
282  call device_free(this%gam_d)
283  end if
284  if (c_associated(this%w_d)) then
285  call device_free(this%w_d)
286  end if
287  if (c_associated(this%c_d)) then
288  call device_free(this%c_d)
289  end if
290  if (c_associated(this%r_d)) then
291  call device_free(this%r_d)
292  end if
293  if (c_associated(this%s_d)) then
294  call device_free(this%s_d)
295  end if
296 
297  nullify(this%M)
298 
299  if (c_associated(this%gs_event)) then
300  call device_event_destroy(this%gs_event)
301  end if
302 
303  end subroutine gmres_device_free
304 
306  function gmres_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter) result(ksp_results)
307  class(gmres_device_t), intent(inout) :: this
308  class(ax_t), intent(inout) :: ax
309  type(field_t), intent(inout) :: x
310  integer, intent(in) :: n
311  real(kind=rp), dimension(n), intent(inout) :: f
312  type(coef_t), intent(inout) :: coef
313  type(bc_list_t), intent(inout) :: blst
314  type(gs_t), intent(inout) :: gs_h
315  type(ksp_monitor_t) :: ksp_results
316  integer, optional, intent(in) :: niter
317  integer :: iter, max_iter
318  integer :: i, j, k
319  real(kind=rp) :: rnorm, alpha, temp, lr, alpha2, norm_fac
320  logical :: conv
321  type(c_ptr) :: f_d
322 
323  f_d = device_get_ptr(f)
324 
325  conv = .false.
326  iter = 0
327 
328  if (present(niter)) then
329  max_iter = niter
330  else
331  max_iter = this%max_iter
332  end if
333 
334  associate(w => this%w, c => this%c, r => this%r, z => this%z, h => this%h, &
335  v => this%v, s => this%s, gam => this%gam, v_d => this%v_d, &
336  w_d => this%w_d, r_d => this%r_d, h_d => this%h_d, v_d_d => this%v_d_d, &
337  x_d => x%x_d, z_d_d => this%z_d_d, c_d => this%c_d)
338 
339  norm_fac = 1.0_rp / sqrt(coef%volume)
340  call rzero(gam, this%m_restart + 1)
341  call rone(s, this%m_restart)
342  call rone(c, this%m_restart)
343  call rzero(h, this%m_restart * this%m_restart)
344  call device_rzero(x%x_d, n)
345  call device_rzero(this%gam_d, this%m_restart + 1)
346  call device_rone(this%s_d, this%m_restart)
347  call device_rone(this%c_d, this%m_restart)
348 
349  call rzero(this%h, this%m_restart**2)
350 ! do j = 1, this%m_restart
351 ! call device_rzero(h_d(j), this%m_restart)
352 ! end do
353  do while (.not. conv .and. iter .lt. max_iter)
354 
355  if(iter.eq.0) then
356  call device_copy(r_d, f_d, n)
357  else
358  call device_copy(r_d, f_d, n)
359  call ax%compute(w, x%x, coef, x%msh, x%Xh)
360  call gs_h%op(w, n, gs_op_add, this%gs_event)
361  call device_event_sync(this%gs_event)
362  call bc_list_apply(blst, w, n)
363  call device_sub2(r_d, w_d, n)
364  end if
365 
366  gam(1) = sqrt(device_glsc3(r_d, r_d, coef%mult_d, n))
367  if(iter.eq.0) then
368  ksp_results%res_start = gam(1) * norm_fac
369  end if
370 
371  if (abscmp(gam(1), 0.0_rp)) return
372 
373  rnorm = 0.0_rp
374  temp = 1.0_rp / gam(1)
375  call device_cmult2(v_d(1), r_d, temp, n)
376  do j = 1, this%m_restart
377  iter = iter+1
378 
379  call this%M%solve(z(1,j), v(1,j), n)
380 
381  call ax%compute(w, z(1,j), coef, x%msh, x%Xh)
382  call gs_h%op(w, n, gs_op_add, this%gs_event)
383  call device_event_sync(this%gs_event)
384  call bc_list_apply(blst, w, n)
385 
386  if (neko_bcknd_opencl .eq. 1) then
387  do i = 1, j
388  h(i,j) = device_glsc3(w_d, v_d(i), coef%mult_d, n)
389 
390  call device_add2s2(w_d, v_d(i), -h(i,j), n)
391 
392  alpha2 = device_glsc3(w_d, w_d, coef%mult_d, n)
393  end do
394  else
395  call device_glsc3_many(h(1,j), w_d, v_d_d, coef%mult_d, j, n)
396 
397  call device_memcpy(h(:,j), h_d(j), j, &
398  host_to_device, sync=.false.)
399 
400  alpha2 = device_gmres_part2(w_d, v_d_d, h_d(j), coef%mult_d, j, n)
401 
402  end if
403 
404  alpha = sqrt(alpha2)
405  do i=1,j-1
406  temp = h(i,j)
407  h(i ,j) = c(i)*temp + s(i) * h(i+1,j)
408  h(i+1,j) = -s(i)*temp + c(i) * h(i+1,j)
409  end do
410 
411  rnorm = 0.0_rp
412  if(abscmp(alpha, 0.0_rp)) then
413  conv = .true.
414  exit
415  end if
416 
417  lr = sqrt(h(j,j) * h(j,j) + alpha**2)
418  temp = 1.0_rp / lr
419  c(j) = h(j,j) * temp
420  s(j) = alpha * temp
421  h(j,j) = lr
422  call device_memcpy(h(:,j), h_d(j), j, &
423  host_to_device, sync=.false.)
424  gam(j+1) = -s(j) * gam(j)
425  gam(j) = c(j) * gam(j)
426 
427  rnorm = abs(gam(j+1)) * norm_fac
428  if (rnorm .lt. this%abs_tol) then
429  conv = .true.
430  exit
431  end if
432 
433  if (iter + 1 .gt. max_iter) exit
434 
435  if( j .lt. this%m_restart) then
436  temp = 1.0_rp / alpha
437  call device_cmult2(v_d(j+1), w_d, temp, n)
438  end if
439 
440  end do
441 
442  j = min(j, this%m_restart)
443  do k = j, 1, -1
444  temp = gam(k)
445  do i = j, k+1, -1
446  temp = temp - h(k,i) * c(i)
447  end do
448  c(k) = temp / h(k,k)
449  end do
450 
451  if (neko_bcknd_opencl .eq. 1) then
452  do i = 1, j
453  call device_add2s2(x_d, this%z_d(i), c(i), n)
454  end do
455  else
456  call device_memcpy(c, c_d, j, host_to_device, sync=.false.)
457  call device_add2s2_many(x_d, z_d_d, c_d, j, n)
458  end if
459  end do
460 
461  end associate
462 
463  ksp_results%res_final = rnorm
464  ksp_results%iter = iter
465 
466  end function gmres_device_solve
467 
468 end module gmres_device
469 
470 
real cuda_gmres_part2(void *w, void *v, void *h, void *mult, int *j, int *n)
Definition: gmres_aux.cu:52
Defines a Matrix-vector product.
Definition: ax.f90:34
Defines a boundary condition.
Definition: bc.f90:34
Coefficients.
Definition: coef.f90:34
Definition: comm.F90:1
Identity Krylov preconditioner for accelerators.
subroutine, public device_add2s1(a_d, b_d, c1, n)
subroutine, public device_rzero(a_d, n)
subroutine, public device_rone(a_d, n)
subroutine, public device_add2s2(a_d, b_d, c1, n)
subroutine, public device_cmult2(a_d, b_d, c, n)
subroutine, public device_add2s2_many(y_d, x_d_d, a_d, j, n)
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n)
subroutine, public device_copy(a_d, b_d, n)
subroutine, public device_glsc3_many(h, w_d, v_d_d, mult_d, j, n)
subroutine, public device_sub2(a_d, b_d, n)
Device abstraction, common interface for various accelerators.
Definition: device.F90:34
Defines a field.
Definition: field.f90:34
Gather-scatter.
Defines various GMRES methods.
real(c_rp) function device_gmres_part2(w_d, v_d_d, h_d, mult_d, j, n)
type(ksp_monitor_t) function gmres_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter)
Standard GMRES solve.
subroutine gmres_device_free(this)
Deallocate a standard GMRES solver.
subroutine gmres_device_init(this, n, max_iter, M, m_restart, rel_tol, abs_tol)
Initialise a standard GMRES solver.
Implements the base abstract type for Krylov solvers plus helper types.
Definition: krylov.f90:34
Definition: math.f90:60
subroutine, public rone(a, n)
Set all elements to one.
Definition: math.f90:200
subroutine, public rzero(a, n)
Zero a real vector.
Definition: math.f90:167
integer, parameter, public c_rp
Definition: num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition: num_types.f90:12
Krylov preconditioner.
Definition: precon.f90:34
Base type for a matrix-vector product providing .
Definition: ax.f90:43
A list of boundary conditions.
Definition: bc.f90:102
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Definition: coef.f90:54
Defines a canonical Krylov preconditioner for accelerators.
Standard preconditioned generalized minimal residual method.
Type for storing initial and final residuals in a Krylov solver.
Definition: krylov.f90:55
Base abstract type for a canonical Krylov method, solving .
Definition: krylov.f90:65
Defines a canonical Krylov preconditioner.
Definition: precon.f90:40