Neko  0.8.1
A portable framework for high-order spectral element flow simulations
pipecg_device.F90
Go to the documentation of this file.
1 ! Copyright (c) 2021-2024, The Neko Authors
2 ! All rights reserved.
3 !
4 ! Redistribution and use in source and binary forms, with or without
5 ! modification, are permitted provided that the following conditions
6 ! are met:
7 !
8 ! * Redistributions of source code must retain the above copyright
9 ! notice, this list of conditions and the following disclaimer.
10 !
11 ! * Redistributions in binary form must reproduce the above
12 ! copyright notice, this list of conditions and the following
13 ! disclaimer in the documentation and/or other materials provided
14 ! with the distribution.
15 !
16 ! * Neither the name of the authors nor the names of its
17 ! contributors may be used to endorse or promote products derived
18 ! from this software without specific prior written permission.
19 !
20 ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 ! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 ! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 ! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 ! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 ! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 ! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 ! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 ! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 ! POSSIBILITY OF SUCH DAMAGE.
32 !
36  use precon, only : pc_t
37  use ax_product, only : ax_t
38  use num_types, only: rp, c_rp
39  use field, only : field_t
40  use coefs, only : coef_t
41  use gather_scatter, only : gs_t, gs_op_add
42  use bc, only : bc_list_t, bc_list_apply
43  use math, only : glsc3, rzero, copy, abscmp
44  use device_math, only : device_rzero, device_copy, &
46  use device
47  use comm
48  implicit none
49  private
50 
51  integer, parameter :: device_pipecg_p_space = 10
52 
54  type, public, extends(ksp_t) :: pipecg_device_t
55  real(kind=rp), allocatable :: p(:)
56  real(kind=rp), allocatable :: q(:)
57  real(kind=rp), allocatable :: r(:)
58  real(kind=rp), allocatable :: s(:)
59  real(kind=rp), allocatable :: u(:,:)
60  real(kind=rp), allocatable :: w(:)
61  real(kind=rp), allocatable :: z(:)
62  real(kind=rp), allocatable :: mi(:)
63  real(kind=rp), allocatable :: ni(:)
64  real(kind=rp), allocatable :: alpha(:)
65  real(kind=rp), allocatable :: beta(:)
66  type(c_ptr) :: p_d = c_null_ptr
67  type(c_ptr) :: q_d = c_null_ptr
68  type(c_ptr) :: r_d = c_null_ptr
69  type(c_ptr) :: s_d = c_null_ptr
70  type(c_ptr) :: u_d_d = c_null_ptr
71  type(c_ptr) :: w_d = c_null_ptr
72  type(c_ptr) :: z_d = c_null_ptr
73  type(c_ptr) :: mi_d = c_null_ptr
74  type(c_ptr) :: ni_d = c_null_ptr
75  type(c_ptr) :: alpha_d = c_null_ptr
76  type(c_ptr) :: beta_d = c_null_ptr
77  type(c_ptr), allocatable :: u_d(:)
78  type(c_ptr) :: gs_event = c_null_ptr
79  contains
80  procedure, pass(this) :: init => pipecg_device_init
81  procedure, pass(this) :: free => pipecg_device_free
82  procedure, pass(this) :: solve => pipecg_device_solve
83  end type pipecg_device_t
84 
85 #ifdef HAVE_CUDA
86  interface
87  subroutine cuda_pipecg_vecops(p_d, q_d, r_d, s_d, u_d1, u_d2, &
88  w_d, z_d, ni_d, mi_d, alpha, beta, mult_d, reduction,n) &
89  bind(c, name='cuda_pipecg_vecops')
90  use, intrinsic :: iso_c_binding
91  import c_rp
92  implicit none
93  type(c_ptr), value :: p_d, q_d, r_d, s_d, u_d1, u_d2
94  type(c_ptr), value :: w_d, ni_d, mi_d, z_d, mult_d
95  integer(c_int) :: n
96  real(c_rp) :: alpha, beta, reduction(3)
97  end subroutine cuda_pipecg_vecops
98  end interface
99 
100  interface
101  subroutine cuda_cg_update_xp(x_d, p_d, u_d_d, alpha, beta, &
102  p_cur, p_space, n) &
103  bind(c, name='cuda_cg_update_xp')
104  use, intrinsic :: iso_c_binding
105  implicit none
106  type(c_ptr), value :: x_d, p_d, u_d_d, alpha, beta
107  integer(c_int) :: p_cur, n, p_space
108  end subroutine cuda_cg_update_xp
109  end interface
110 #elif HAVE_HIP
111  interface
112  subroutine hip_pipecg_vecops(p_d, q_d, r_d, s_d, u_d1, u_d2, &
113  w_d, z_d, ni_d, mi_d, alpha, beta, mult_d, reduction,n) &
114  bind(c, name='hip_pipecg_vecops')
115  use, intrinsic :: iso_c_binding
116  import c_rp
117  implicit none
118  type(c_ptr), value :: p_d, q_d, r_d, s_d, u_d1, u_d2
119  type(c_ptr), value :: w_d, ni_d, mi_d, z_d, mult_d
120  integer(c_int) :: n
121  real(c_rp) :: alpha, beta, reduction(3)
122  end subroutine hip_pipecg_vecops
123  end interface
124 
125  interface
126  subroutine hip_cg_update_xp(x_d, p_d, u_d_d, alpha, beta, &
127  p_cur, p_space, n) &
128  bind(c, name='hip_cg_update_xp')
129  use, intrinsic :: iso_c_binding
130  implicit none
131  type(c_ptr), value :: x_d, p_d, u_d_d, alpha, beta
132  integer(c_int) :: p_cur, n, p_space
133  end subroutine hip_cg_update_xp
134  end interface
135 #endif
136 
137 contains
138 
139  subroutine device_pipecg_vecops(p_d, q_d, r_d, s_d, u_d1, u_d2, &
140  w_d, z_d, ni_d, mi_d, alpha, beta, mult_d, reduction,n)
141  type(c_ptr), value :: p_d, q_d, r_d, s_d, u_d1, u_d2
142  type(c_ptr), value :: w_d, ni_d, mi_d, z_d, mult_d
143  integer(c_int) :: n
144  real(c_rp) :: alpha, beta, reduction(3)
145 #ifdef HAVE_HIP
146  call hip_pipecg_vecops(p_d, q_d, r_d,&
147  s_d, u_d1, u_d2, w_d, z_d, ni_d, mi_d, alpha, beta, mult_d, reduction,n)
148 #elif HAVE_CUDA
149  call cuda_pipecg_vecops(p_d, q_d, r_d,&
150  s_d, u_d1, u_d2, w_d, z_d, ni_d, mi_d, alpha, beta, mult_d, reduction,n)
151 #else
152  call neko_error('No device backend configured')
153 #endif
154  end subroutine device_pipecg_vecops
155 
156  subroutine device_cg_update_xp(x_d, p_d, u_d_d, alpha, beta, p_cur, p_space, n)
157  use, intrinsic :: iso_c_binding
158  type(c_ptr), value :: x_d, p_d, u_d_d, alpha, beta
159  integer(c_int) :: p_cur, n, p_space
160 #ifdef HAVE_HIP
161  call hip_cg_update_xp(x_d, p_d, u_d_d, alpha, beta, p_cur, p_space, n)
162 #elif HAVE_CUDA
163  call cuda_cg_update_xp(x_d, p_d, u_d_d, alpha, beta, p_cur, p_space, n)
164 #else
165  call neko_error('No device backend configured')
166 #endif
167  end subroutine device_cg_update_xp
168 
170  subroutine pipecg_device_init(this, n, max_iter, M, rel_tol, abs_tol)
171  class(pipecg_device_t), target, intent(inout) :: this
172  class(pc_t), optional, intent(inout), target :: M
173  integer, intent(in) :: n
174  integer, intent(in) :: max_iter
175  real(kind=rp), optional, intent(inout) :: rel_tol
176  real(kind=rp), optional, intent(inout) :: abs_tol
177  type(c_ptr) :: ptr
178  integer(c_size_t) :: u_size
179  integer :: i
180 
181  call this%free()
182 
183  allocate(this%p(n))
184  allocate(this%q(n))
185  allocate(this%r(n))
186  allocate(this%s(n))
187  allocate(this%u(n, device_pipecg_p_space+1))
188  allocate(this%u_d(device_pipecg_p_space+1))
189  allocate(this%w(n))
190  allocate(this%z(n))
191  allocate(this%mi(n))
192  allocate(this%ni(n))
193  allocate(this%alpha(device_pipecg_p_space))
194  allocate(this%beta(device_pipecg_p_space))
195 
196  if (present(m)) then
197  this%M => m
198  end if
199 
200  call device_map(this%p, this%p_d, n)
201  call device_map(this%q, this%q_d, n)
202  call device_map(this%r, this%r_d, n)
203  call device_map(this%s, this%s_d, n)
204  call device_map(this%w, this%w_d, n)
205  call device_map(this%z, this%z_d, n)
206  call device_map(this%mi, this%mi_d, n)
207  call device_map(this%ni, this%ni_d, n)
208  call device_map(this%alpha, this%alpha_d, device_pipecg_p_space)
209  call device_map(this%beta, this%beta_d, device_pipecg_p_space)
210  do i = 1, device_pipecg_p_space+1
211  this%u_d(i) = c_null_ptr
212  call device_map(this%u(:,i), this%u_d(i), n)
213  end do
214  !Did not work with 4 for some reason...
215  u_size = 8*(device_pipecg_p_space+1)
216  call device_alloc(this%u_d_d, u_size)
217  ptr = c_loc(this%u_d)
218  call device_memcpy(ptr,this%u_d_d, u_size, &
219  host_to_device, sync=.false.)
220 
221  if (present(rel_tol) .and. present(abs_tol)) then
222  call this%ksp_init(max_iter, rel_tol, abs_tol)
223  else if (present(rel_tol)) then
224  call this%ksp_init(max_iter, rel_tol=rel_tol)
225  else if (present(abs_tol)) then
226  call this%ksp_init(max_iter, abs_tol=abs_tol)
227  else
228  call this%ksp_init(max_iter)
229  end if
230 
231  call device_event_create(this%gs_event, 2)
232 
233  end subroutine pipecg_device_init
234 
236  subroutine pipecg_device_free(this)
237  class(pipecg_device_t), intent(inout) :: this
238  integer :: i
239 
240  call this%ksp_free()
241 
242  if (allocated(this%p)) then
243  deallocate(this%p)
244  end if
245  if (allocated(this%q)) then
246  deallocate(this%q)
247  end if
248  if (allocated(this%r)) then
249  deallocate(this%r)
250  end if
251  if (allocated(this%s)) then
252  deallocate(this%s)
253  end if
254  if (allocated(this%u)) then
255  deallocate(this%u)
256  end if
257  if (allocated(this%w)) then
258  deallocate(this%w)
259  end if
260  if (allocated(this%z)) then
261  deallocate(this%z)
262  end if
263  if (allocated(this%mi)) then
264  deallocate(this%mi)
265  end if
266  if (allocated(this%ni)) then
267  deallocate(this%ni)
268  end if
269  if (allocated(this%alpha)) then
270  deallocate(this%alpha)
271  end if
272  if (allocated(this%beta)) then
273  deallocate(this%beta)
274  end if
275 
276 
277  if (c_associated(this%p_d)) then
278  call device_free(this%p_d)
279  end if
280  if (c_associated(this%q_d)) then
281  call device_free(this%q_d)
282  end if
283  if (c_associated(this%r_d)) then
284  call device_free(this%r_d)
285  end if
286  if (c_associated(this%s_d)) then
287  call device_free(this%s_d)
288  end if
289  if (c_associated(this%u_d_d)) then
290  call device_free(this%u_d_d)
291  end if
292  if (c_associated(this%w_d)) then
293  call device_free(this%w_d)
294  end if
295  if (c_associated(this%z_d)) then
296  call device_free(this%z_d)
297  end if
298  if (c_associated(this%mi_d)) then
299  call device_free(this%mi_d)
300  end if
301  if (c_associated(this%ni_d)) then
302  call device_free(this%ni_d)
303  end if
304  if (c_associated(this%alpha_d)) then
305  call device_free(this%alpha_d)
306  end if
307  if (c_associated(this%beta_d)) then
308  call device_free(this%beta_d)
309  end if
310  if (allocated(this%u_d)) then
311  do i = 1, device_pipecg_p_space
312  if (c_associated(this%u_d(i))) then
313  call device_free(this%u_d(i))
314  end if
315  end do
316  end if
317 
318  nullify(this%M)
319 
320  if (c_associated(this%gs_event)) then
321  call device_event_destroy(this%gs_event)
322  end if
323 
324  end subroutine pipecg_device_free
325 
327  function pipecg_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter) result(ksp_results)
328  class(pipecg_device_t), intent(inout) :: this
329  class(ax_t), intent(inout) :: ax
330  type(field_t), intent(inout) :: x
331  integer, intent(in) :: n
332  real(kind=rp), dimension(n), intent(inout) :: f
333  type(coef_t), intent(inout) :: coef
334  type(bc_list_t), intent(inout) :: blst
335  type(gs_t), intent(inout) :: gs_h
336  type(ksp_monitor_t) :: ksp_results
337  integer, optional, intent(in) :: niter
338  integer :: iter, max_iter, ierr, p_cur, p_prev, u_prev
339  real(kind=rp) :: rnorm, rtr, reduction(3), norm_fac
340  real(kind=rp) :: gamma1, gamma2, delta
341  real(kind=rp) :: tmp1, tmp2, tmp3
342  type(mpi_request) :: request
343  type(mpi_status) :: status
344  type(c_ptr) :: f_d
345  f_d = device_get_ptr(f)
346 
347  if (present(niter)) then
348  max_iter = niter
349  else
350  max_iter = this%max_iter
351  end if
352  norm_fac = 1.0_rp / sqrt(coef%volume)
353 
354  associate(p => this%p, q => this%q, r => this%r, s => this%s, &
355  u => this%u, w => this%w, z => this%z, mi => this%mi, ni => this%ni, &
356  alpha => this%alpha, beta => this%beta, &
357  alpha_d => this%alpha_d, beta_d => this%beta_d, &
358  p_d => this%p_d, q_d => this%q_d, r_d => this%r_d, &
359  s_d => this%s_d, u_d => this%u_d, u_d_d => this%u_d_d, &
360  w_d => this%w_d, z_d => this%z_d, mi_d => this%mi_d, ni_d => this%ni_d)
361 
362  p_prev = device_pipecg_p_space !this%p_space
363  u_prev = device_pipecg_p_space + 1 !this%p_space+1
364  p_cur = 1
365  call device_rzero(x%x_d, n)
366  call device_rzero(z_d, n)
367  call device_rzero(q_d, n)
368  call device_rzero(p_d, n)
369  call device_rzero(s_d, n)
370  call device_copy(r_d, f_d, n)
371  !apply u=M^-1r
372  !call device_copy(u_d(u_prev), r_d, n)
373  call this%M%solve(u(1,u_prev), r, n)
374  call ax%compute(w, u(1,u_prev), coef, x%msh, x%Xh)
375  call gs_h%op(w, n, gs_op_add, this%gs_event)
376  call device_event_sync(this%gs_event)
377  call bc_list_apply(blst, w, n)
378 
379  rtr = device_glsc3(r_d, coef%mult_d, r_d, n)
380  rnorm = sqrt(rtr)*norm_fac
381  ksp_results%res_start = rnorm
382  ksp_results%res_final = rnorm
383  ksp_results%iter = 0
384  if(abscmp(rnorm, 0.0_rp)) return
385 
386  gamma1 = 0.0_rp
387  tmp1 = 0.0_rp
388  tmp2 = 0.0_rp
389  tmp3 = 0.0_rp
390  tmp1 = device_vlsc3(r_d, coef%mult_d, u_d(u_prev), n)
391  tmp2 = device_vlsc3(w_d, coef%mult_d, u_d(u_prev), n)
392  tmp3 = device_vlsc3(r_d, coef%mult_d, r_d, n)
393  reduction(1) = tmp1
394  reduction(2) = tmp2
395  reduction(3) = tmp3
396 
397  do iter = 1, max_iter
398  call mpi_iallreduce(mpi_in_place, reduction, 3, &
399  mpi_real_precision, mpi_sum, neko_comm, request, ierr)
400 
401  call this%M%solve(mi, w, n)
402  call ax%compute(ni, mi, coef, x%msh, x%Xh)
403  call gs_h%op(ni, n, gs_op_add, this%gs_event)
404  call device_event_sync(this%gs_event)
405  call bc_list_apply(blst, ni, n)
406 
407  call mpi_wait(request, status, ierr)
408  gamma2 = gamma1
409  gamma1 = reduction(1)
410  delta = reduction(2)
411  rtr = reduction(3)
412 
413  rnorm = sqrt(rtr)*norm_fac
414  if (rnorm .lt. this%abs_tol) exit
415 
416 
417  if (iter .gt. 1) then
418  beta(p_cur) = gamma1 / gamma2
419  alpha(p_cur) = gamma1 / (delta - (beta(p_cur) * gamma1/alpha(p_prev)))
420  else
421  beta(p_cur) = 0.0_rp
422  alpha(p_cur) = gamma1/delta
423  end if
424 
425  call device_pipecg_vecops(p_d, q_d, r_d,&
426  s_d, u_d(u_prev), u_d(p_cur),&
427  w_d, z_d, ni_d,&
428  mi_d, alpha(p_cur), beta(p_cur),&
429  coef%mult_d, reduction, n)
430  if (p_cur .eq. device_pipecg_p_space) then
431  call device_memcpy(alpha, alpha_d, p_cur, &
432  host_to_device, sync=.false.)
433  call device_memcpy(beta, beta_d, p_cur, &
434  host_to_device, sync=.false.)
435  call device_cg_update_xp(x%x_d, p_d, u_d_d, alpha_d, beta_d, p_cur, &
437  p_prev = p_cur
438  u_prev = device_pipecg_p_space + 1
439  alpha(1) = alpha(p_cur)
440  beta(1) = beta(p_cur)
441  p_cur = 1
442  else
443  u_prev = p_cur
444  p_prev = p_cur
445  p_cur = p_cur + 1
446  end if
447  end do
448 
449  if ( p_cur .ne. 1) then
450  call device_memcpy(alpha, alpha_d, p_cur, host_to_device, sync=.false.)
451  call device_memcpy(beta, beta_d, p_cur, host_to_device, sync=.false.)
452  call device_cg_update_xp(x%x_d, p_d, u_d_d, alpha_d, beta_d, p_cur, &
454  end if
455 
456  ksp_results%res_final = rnorm
457  ksp_results%iter = iter
458 
459  end associate
460 
461  end function pipecg_device_solve
462 
463 end module pipecg_device
464 
465 
Return the device pointer for an associated Fortran array.
Definition: device.F90:81
Map a Fortran array to a device (allocate and associate)
Definition: device.F90:57
Copy data between host and device (or device and device)
Definition: device.F90:51
Defines a Matrix-vector product.
Definition: ax.f90:34
Defines a boundary condition.
Definition: bc.f90:34
Coefficients.
Definition: coef.f90:34
Definition: comm.F90:1
type(mpi_comm) neko_comm
MPI communicator.
Definition: comm.F90:16
type(mpi_datatype) mpi_real_precision
MPI type for working precision of REAL types.
Definition: comm.F90:22
subroutine, public device_rzero(a_d, n)
real(kind=rp) function, public device_vlsc3(u_d, v_d, w_d, n)
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n)
subroutine, public device_copy(a_d, b_d, n)
Device abstraction, common interface for various accelerators.
Definition: device.F90:34
subroutine, public device_event_sync(event)
Synchronize an event.
Definition: device.F90:1209
integer, parameter, public host_to_device
Definition: device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition: device.F90:172
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition: device.F90:1172
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
Definition: device.F90:151
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition: device.F90:1142
Defines a field.
Definition: field.f90:34
Gather-scatter.
Implements the base abstract type for Krylov solvers plus helper types.
Definition: krylov.f90:34
integer, parameter, public ksp_max_iter
Maximum number of iters.
Definition: krylov.f90:50
Definition: math.f90:60
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
Definition: math.f90:810
subroutine, public copy(a, b, n)
Copy a vector .
Definition: math.f90:211
subroutine, public rzero(a, n)
Zero a real vector.
Definition: math.f90:167
integer, parameter, public c_rp
Definition: num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition: num_types.f90:12
Defines a pipelined Conjugate Gradient methods.
subroutine device_pipecg_vecops(p_d, q_d, r_d, s_d, u_d1, u_d2, w_d, z_d, ni_d, mi_d, alpha, beta, mult_d, reduction, n)
type(ksp_monitor_t) function pipecg_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
subroutine device_cg_update_xp(x_d, p_d, u_d_d, alpha, beta, p_cur, p_space, n)
integer, parameter device_pipecg_p_space
subroutine pipecg_device_init(this, n, max_iter, M, rel_tol, abs_tol)
Initialise a pipelined PCG solver.
subroutine pipecg_device_free(this)
Deallocate a pipelined PCG solver.
Krylov preconditioner.
Definition: precon.f90:34
void hip_cg_update_xp(void *x, void *p, void *u, void *alpha, void *beta, int *p_cur, int *p_space, int *n)
Definition: pipecg_aux.hip:51
void hip_pipecg_vecops(void *p, void *q, void *r, void *s, void *u1, void *u2, void *w, void *z, void *ni, void *mi, real *alpha, real *beta, void *mult, real *reduction, int *n)
Definition: pipecg_aux.hip:64
Base type for a matrix-vector product providing .
Definition: ax.f90:43
A list of boundary conditions.
Definition: bc.f90:102
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Definition: coef.f90:54
Type for storing initial and final residuals in a Krylov solver.
Definition: krylov.f90:55
Base abstract type for a canonical Krylov method, solving .
Definition: krylov.f90:65
Pipelined preconditioned conjugate gradient method.
Defines a canonical Krylov preconditioner.
Definition: precon.f90:40