Neko  0.8.1
A portable framework for high-order spectral element flow simulations
fusedcg_device.F90
Go to the documentation of this file.
1 ! Copyright (c) 2021-2024, The Neko Authors
2 ! All rights reserved.
3 !
4 ! Redistribution and use in source and binary forms, with or without
5 ! modification, are permitted provided that the following conditions
6 ! are met:
7 !
8 ! * Redistributions of source code must retain the above copyright
9 ! notice, this list of conditions and the following disclaimer.
10 !
11 ! * Redistributions in binary form must reproduce the above
12 ! copyright notice, this list of conditions and the following
13 ! disclaimer in the documentation and/or other materials provided
14 ! with the distribution.
15 !
16 ! * Neither the name of the authors nor the names of its
17 ! contributors may be used to endorse or promote products derived
18 ! from this software without specific prior written permission.
19 !
20 ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 ! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 ! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 ! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 ! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 ! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 ! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 ! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 ! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 ! POSSIBILITY OF SUCH DAMAGE.
32 !
36  use precon, only : pc_t
37  use ax_product, only : ax_t
38  use num_types, only: rp, c_rp
39  use field, only : field_t
40  use coefs, only : coef_t
41  use gather_scatter, only : gs_t, gs_op_add
42  use bc, only : bc_list_t, bc_list_apply
43  use math, only : glsc3, rzero, copy, abscmp
45  use device
46  use comm
47  implicit none
48  private
49 
50  integer, parameter :: device_fusedcg_p_space = 10
51 
53  type, public, extends(ksp_t) :: fusedcg_device_t
54  real(kind=rp), allocatable :: w(:)
55  real(kind=rp), allocatable :: r(:)
56  real(kind=rp), allocatable :: z(:)
57  real(kind=rp), allocatable :: p(:,:)
58  real(kind=rp), allocatable :: alpha(:)
59  type(c_ptr) :: w_d = c_null_ptr
60  type(c_ptr) :: r_d = c_null_ptr
61  type(c_ptr) :: z_d = c_null_ptr
62  type(c_ptr) :: alpha_d = c_null_ptr
63  type(c_ptr) :: p_d_d = c_null_ptr
64  type(c_ptr), allocatable :: p_d(:)
65  type(c_ptr) :: gs_event = c_null_ptr
66  contains
67  procedure, pass(this) :: init => fusedcg_device_init
68  procedure, pass(this) :: free => fusedcg_device_free
69  procedure, pass(this) :: solve => fusedcg_device_solve
70  end type fusedcg_device_t
71 
72 #ifdef HAVE_CUDA
73  interface
74  subroutine cuda_fusedcg_update_p(p_d, z_d, po_d, beta, n) &
75  bind(c, name='cuda_fusedcg_update_p')
76  use, intrinsic :: iso_c_binding
77  import c_rp
78  implicit none
79  type(c_ptr), value :: p_d, z_d, po_d
80  real(c_rp) :: beta
81  integer(c_int) :: n
82  end subroutine cuda_fusedcg_update_p
83  end interface
84 
85  interface
86  subroutine cuda_fusedcg_update_x(x_d, p_d, alpha, p_cur, n) &
87  bind(c, name='cuda_fusedcg_update_x')
88  use, intrinsic :: iso_c_binding
89  implicit none
90  type(c_ptr), value :: x_d, p_d, alpha
91  integer(c_int) :: p_cur, n
92  end subroutine cuda_fusedcg_update_x
93  end interface
94 
95  interface
96  real(c_rp) function cuda_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
97  p_cur, n) bind(c, name='cuda_fusedcg_part2')
98  use, intrinsic :: iso_c_binding
99  import c_rp
100  implicit none
101  type(c_ptr), value :: a_d, b_d, c_d, alpha_d
102  real(c_rp) :: alpha
103  integer(c_int) :: n, p_cur
104  end function cuda_fusedcg_part2
105  end interface
106 #elif HAVE_HIP
107  interface
108  subroutine hip_fusedcg_update_p(p_d, z_d, po_d, beta, n) &
109  bind(c, name='hip_fusedcg_update_p')
110  use, intrinsic :: iso_c_binding
111  import c_rp
112  implicit none
113  type(c_ptr), value :: p_d, z_d, po_d
114  real(c_rp) :: beta
115  integer(c_int) :: n
116  end subroutine hip_fusedcg_update_p
117  end interface
118 
119  interface
120  subroutine hip_fusedcg_update_x(x_d, p_d, alpha, p_cur, n) &
121  bind(c, name='hip_fusedcg_update_x')
122  use, intrinsic :: iso_c_binding
123  implicit none
124  type(c_ptr), value :: x_d, p_d, alpha
125  integer(c_int) :: p_cur, n
126  end subroutine hip_fusedcg_update_x
127  end interface
128 
129  interface
130  real(c_rp) function hip_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
131  p_cur, n) bind(c, name='hip_fusedcg_part2')
132  use, intrinsic :: iso_c_binding
133  import c_rp
134  implicit none
135  type(c_ptr), value :: a_d, b_d, c_d, alpha_d
136  real(c_rp) :: alpha
137  integer(c_int) :: n, p_cur
138  end function hip_fusedcg_part2
139  end interface
140 #endif
141 
142 contains
143 
144  subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
145  type(c_ptr), value :: p_d, z_d, po_d
146  real(c_rp) :: beta
147  integer(c_int) :: n
148 #ifdef HAVE_HIP
149  call hip_fusedcg_update_p(p_d, z_d, po_d, beta, n)
150 #elif HAVE_CUDA
151  call cuda_fusedcg_update_p(p_d, z_d, po_d, beta, n)
152 #else
153  call neko_error('No device backend configured')
154 #endif
155  end subroutine device_fusedcg_update_p
156 
157  subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
158  type(c_ptr), value :: x_d, p_d, alpha
159  integer(c_int) :: p_cur, n
160 #ifdef HAVE_HIP
161  call hip_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
162 #elif HAVE_CUDA
163  call cuda_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
164 #else
165  call neko_error('No device backend configured')
166 #endif
167  end subroutine device_fusedcg_update_x
168 
169  function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
170  p_cur, n) result(res)
171  type(c_ptr), value :: a_d, b_d, c_d, alpha_d
172  real(c_rp) :: alpha
173  integer :: n, p_cur
174  real(kind=rp) :: res
175  integer :: ierr
176 #ifdef HAVE_HIP
177  res = hip_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
178 #elif HAVE_CUDA
179  res = cuda_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
180 #else
181  call neko_error('No device backend configured')
182 #endif
183 
184 #ifndef HAVE_DEVICE_MPI
185  if (pe_size .gt. 1) then
186  call mpi_allreduce(mpi_in_place, res, 1, &
187  mpi_real_precision, mpi_sum, neko_comm, ierr)
188  end if
189 #endif
190 
191  end function device_fusedcg_part2
192 
194  subroutine fusedcg_device_init(this, n, max_iter, M, rel_tol, abs_tol)
195  class(fusedcg_device_t), target, intent(inout) :: this
196  class(pc_t), optional, intent(inout), target :: M
197  integer, intent(in) :: n
198  integer, intent(in) :: max_iter
199  real(kind=rp), optional, intent(inout) :: rel_tol
200  real(kind=rp), optional, intent(inout) :: abs_tol
201  type(c_ptr) :: ptr
202  integer(c_size_t) :: p_size
203  integer :: i
204 
205  call this%free()
206 
207  allocate(this%w(n))
208  allocate(this%r(n))
209  allocate(this%z(n))
210  allocate(this%p(n, device_fusedcg_p_space))
211  allocate(this%p_d(device_fusedcg_p_space))
212  allocate(this%alpha(device_fusedcg_p_space))
213 
214  if (present(m)) then
215  this%M => m
216  end if
217 
218  call device_map(this%w, this%w_d, n)
219  call device_map(this%r, this%r_d, n)
220  call device_map(this%z, this%z_d, n)
221  call device_map(this%alpha, this%alpha_d, device_fusedcg_p_space)
222  do i = 1, device_fusedcg_p_space+1
223  this%p_d(i) = c_null_ptr
224  call device_map(this%p(:,i), this%p_d(i), n)
225  end do
226 
227  p_size = c_sizeof(c_null_ptr) * (device_fusedcg_p_space)
228  call device_alloc(this%p_d_d, p_size)
229  ptr = c_loc(this%p_d)
230  call device_memcpy(ptr, this%p_d_d, p_size, &
231  host_to_device, sync=.false.)
232  if (present(rel_tol) .and. present(abs_tol)) then
233  call this%ksp_init(max_iter, rel_tol, abs_tol)
234  else if (present(rel_tol)) then
235  call this%ksp_init(max_iter, rel_tol=rel_tol)
236  else if (present(abs_tol)) then
237  call this%ksp_init(max_iter, abs_tol=abs_tol)
238  else
239  call this%ksp_init(max_iter)
240  end if
241 
242  call device_event_create(this%gs_event, 2)
243 
244  end subroutine fusedcg_device_init
245 
247  subroutine fusedcg_device_free(this)
248  class(fusedcg_device_t), intent(inout) :: this
249  integer :: i
250 
251  call this%ksp_free()
252 
253  if (allocated(this%w)) then
254  deallocate(this%w)
255  end if
256 
257  if (allocated(this%r)) then
258  deallocate(this%r)
259  end if
260 
261 
262  if (allocated(this%z)) then
263  deallocate(this%z)
264  end if
265 
266 
267  if (allocated(this%alpha)) then
268  deallocate(this%alpha)
269  end if
270 
271  if (allocated(this%p)) then
272  deallocate(this%p)
273  end if
274 
275  if (c_associated(this%w_d)) then
276  call device_free(this%w_d)
277  end if
278 
279  if (c_associated(this%r_d)) then
280  call device_free(this%r_d)
281  end if
282 
283  if (c_associated(this%z_d)) then
284  call device_free(this%z_d)
285  end if
286 
287  if (c_associated(this%alpha_d)) then
288  call device_free(this%alpha_d)
289  end if
290 
291  if (allocated(this%p_d)) then
292  do i = 1, device_fusedcg_p_space
293  if (c_associated(this%p_d(i))) then
294  call device_free(this%p_d(i))
295  end if
296  end do
297  end if
298 
299  nullify(this%M)
300 
301  if (c_associated(this%gs_event)) then
302  call device_event_destroy(this%gs_event)
303  end if
304 
305  end subroutine fusedcg_device_free
306 
308  function fusedcg_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter) result(ksp_results)
309  class(fusedcg_device_t), intent(inout) :: this
310  class(ax_t), intent(inout) :: ax
311  type(field_t), intent(inout) :: x
312  integer, intent(in) :: n
313  real(kind=rp), dimension(n), intent(inout) :: f
314  type(coef_t), intent(inout) :: coef
315  type(bc_list_t), intent(inout) :: blst
316  type(gs_t), intent(inout) :: gs_h
317  type(ksp_monitor_t) :: ksp_results
318  integer, optional, intent(in) :: niter
319  integer :: iter, max_iter, ierr, i, p_cur, p_prev
320  real(kind=rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
321  real(kind=rp) :: pap, beta
322  type(c_ptr) :: f_d
323  f_d = device_get_ptr(f)
324 
325  if (present(niter)) then
326  max_iter = niter
327  else
328  max_iter = ksp_max_iter
329  end if
330  norm_fac = 1.0_rp / sqrt(coef%volume)
331 
332  associate(w => this%w, r => this%r, p => this%p, z => this%z, &
333  alpha => this%alpha, alpha_d => this%alpha_d, &
334  w_d => this%w_d, r_d => this%r_d, z_d => this%z_d, &
335  p_d => this%p_d, p_d_d => this%p_d_d)
336 
337  rtz1 = 1.0_rp
338  p_prev = device_fusedcg_p_space
339  p_cur = 1
340  call device_rzero(x%x_d, n)
341  call device_rzero(p_d(1), n)
342  call device_copy(r_d, f_d, n)
343 
344  rtr = device_glsc3(r_d, coef%mult_d, r_d, n)
345  rnorm = sqrt(rtr)*norm_fac
346  ksp_results%res_start = rnorm
347  ksp_results%res_final = rnorm
348  ksp_results%iter = 0
349  if(abscmp(rnorm, 0.0_rp)) return
350 
351  do iter = 1, max_iter
352  call this%M%solve(z, r, n)
353  rtz2 = rtz1
354  rtz1 = device_glsc3(r_d, coef%mult_d, z_d, n)
355  beta = rtz1 / rtz2
356  if (iter .eq. 1) beta = 0.0_rp
357  call device_fusedcg_update_p(p_d(p_cur), z_d, p_d(p_prev), beta, n)
358 
359  call ax%compute(w, p(1, p_cur), coef, x%msh, x%Xh)
360  call gs_h%op(w, n, gs_op_add, this%gs_event)
361  call device_event_sync(this%gs_event)
362  call bc_list_apply(blst, w, n)
363 
364  pap = device_glsc3(w_d, coef%mult_d, this%p_d(p_cur), n)
365 
366  alpha(p_cur) = rtz1 / pap
367  rtr = device_fusedcg_part2(r_d, coef%mult_d, w_d, &
368  alpha_d, alpha(p_cur), p_cur, n)
369  rnorm = sqrt(rtr)*norm_fac
370 
371  if ((p_cur .eq. device_fusedcg_p_space) .or. &
372  (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter) then
373  call device_fusedcg_update_x(x%x_d, p_d_d, alpha_d, p_cur, n)
374  p_prev = p_cur
375  p_cur = 1
376  if (rnorm .lt. this%abs_tol) exit
377  else
378  p_prev = p_cur
379  p_cur = p_cur + 1
380  end if
381  end do
382 
383  ksp_results%res_final = rnorm
384  ksp_results%iter = iter
385 
386  end associate
387 
388  end function fusedcg_device_solve
389 
390 end module fusedcg_device
391 
392 
void hip_fusedcg_update_x(void *x, void *p, void *alpha, int *p_cur, int *n)
Definition: fusedcg_aux.hip:67
real hip_fusedcg_part2(void *a, void *b, void *c, void *alpha_d, real *alpha, int *p_cur, int *n)
Definition: fusedcg_aux.hip:81
void hip_fusedcg_update_p(void *p, void *z, void *po, real *beta, int *n)
Definition: fusedcg_aux.hip:54
Defines a Matrix-vector product.
Definition: ax.f90:34
Defines a boundary condition.
Definition: bc.f90:34
Coefficients.
Definition: coef.f90:34
Definition: comm.F90:1
subroutine, public device_rzero(a_d, n)
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n)
subroutine, public device_copy(a_d, b_d, n)
Device abstraction, common interface for various accelerators.
Definition: device.F90:34
Defines a field.
Definition: field.f90:34
Defines a fused Conjugate Gradient method for accelerators.
subroutine fusedcg_device_free(this)
Deallocate a pipelined PCG solver.
type(ksp_monitor_t) function fusedcg_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
real(kind=rp) function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
subroutine fusedcg_device_init(this, n, max_iter, M, rel_tol, abs_tol)
Initialise a fused PCG solver.
integer, parameter device_fusedcg_p_space
subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
Gather-scatter.
Implements the base abstract type for Krylov solvers plus helper types.
Definition: krylov.f90:34
integer, parameter, public ksp_max_iter
Maximum number of iters.
Definition: krylov.f90:50
Definition: math.f90:60
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
Definition: math.f90:810
subroutine, public copy(a, b, n)
Copy a vector .
Definition: math.f90:211
subroutine, public rzero(a, n)
Zero a real vector.
Definition: math.f90:167
integer, parameter, public c_rp
Definition: num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition: num_types.f90:12
Krylov preconditioner.
Definition: precon.f90:34
Base type for a matrix-vector product providing .
Definition: ax.f90:43
A list of boundary conditions.
Definition: bc.f90:102
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Definition: coef.f90:54
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Definition: krylov.f90:55
Base abstract type for a canonical Krylov method, solving .
Definition: krylov.f90:65
Defines a canonical Krylov preconditioner.
Definition: precon.f90:40