Neko  0.9.0
A portable framework for high-order spectral element flow simulations
fusedcg_device.F90
Go to the documentation of this file.
1 ! Copyright (c) 2021-2024, The Neko Authors
2 ! All rights reserved.
3 !
4 ! Redistribution and use in source and binary forms, with or without
5 ! modification, are permitted provided that the following conditions
6 ! are met:
7 !
8 ! * Redistributions of source code must retain the above copyright
9 ! notice, this list of conditions and the following disclaimer.
10 !
11 ! * Redistributions in binary form must reproduce the above
12 ! copyright notice, this list of conditions and the following
13 ! disclaimer in the documentation and/or other materials provided
14 ! with the distribution.
15 !
16 ! * Neither the name of the authors nor the names of its
17 ! contributors may be used to endorse or promote products derived
18 ! from this software without specific prior written permission.
19 !
20 ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 ! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 ! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 ! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 ! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 ! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 ! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 ! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 ! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 ! POSSIBILITY OF SUCH DAMAGE.
32 !
36  use precon, only : pc_t
37  use ax_product, only : ax_t
38  use num_types, only: rp, c_rp
39  use field, only : field_t
40  use coefs, only : coef_t
41  use gather_scatter, only : gs_t, gs_op_add
42  use bc, only : bc_list_t, bc_list_apply
43  use math, only : glsc3, rzero, copy, abscmp
45  use device
46  use comm
47  implicit none
48  private
49 
50  integer, parameter :: device_fusedcg_p_space = 10
51 
53  type, public, extends(ksp_t) :: fusedcg_device_t
54  real(kind=rp), allocatable :: w(:)
55  real(kind=rp), allocatable :: r(:)
56  real(kind=rp), allocatable :: z(:)
57  real(kind=rp), allocatable :: p(:,:)
58  real(kind=rp), allocatable :: alpha(:)
59  type(c_ptr) :: w_d = c_null_ptr
60  type(c_ptr) :: r_d = c_null_ptr
61  type(c_ptr) :: z_d = c_null_ptr
62  type(c_ptr) :: alpha_d = c_null_ptr
63  type(c_ptr) :: p_d_d = c_null_ptr
64  type(c_ptr), allocatable :: p_d(:)
65  type(c_ptr) :: gs_event = c_null_ptr
66  contains
67  procedure, pass(this) :: init => fusedcg_device_init
68  procedure, pass(this) :: free => fusedcg_device_free
69  procedure, pass(this) :: solve => fusedcg_device_solve
70  procedure, pass(this) :: solve_coupled => fusedcg_device_solve_coupled
71  end type fusedcg_device_t
72 
73 #ifdef HAVE_CUDA
74  interface
75  subroutine cuda_fusedcg_update_p(p_d, z_d, po_d, beta, n) &
76  bind(c, name='cuda_fusedcg_update_p')
77  use, intrinsic :: iso_c_binding
78  import c_rp
79  implicit none
80  type(c_ptr), value :: p_d, z_d, po_d
81  real(c_rp) :: beta
82  integer(c_int) :: n
83  end subroutine cuda_fusedcg_update_p
84  end interface
85 
86  interface
87  subroutine cuda_fusedcg_update_x(x_d, p_d, alpha, p_cur, n) &
88  bind(c, name='cuda_fusedcg_update_x')
89  use, intrinsic :: iso_c_binding
90  implicit none
91  type(c_ptr), value :: x_d, p_d, alpha
92  integer(c_int) :: p_cur, n
93  end subroutine cuda_fusedcg_update_x
94  end interface
95 
96  interface
97  real(c_rp) function cuda_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
98  p_cur, n) bind(c, name='cuda_fusedcg_part2')
99  use, intrinsic :: iso_c_binding
100  import c_rp
101  implicit none
102  type(c_ptr), value :: a_d, b_d, c_d, alpha_d
103  real(c_rp) :: alpha
104  integer(c_int) :: n, p_cur
105  end function cuda_fusedcg_part2
106  end interface
107 #elif HAVE_HIP
108  interface
109  subroutine hip_fusedcg_update_p(p_d, z_d, po_d, beta, n) &
110  bind(c, name='hip_fusedcg_update_p')
111  use, intrinsic :: iso_c_binding
112  import c_rp
113  implicit none
114  type(c_ptr), value :: p_d, z_d, po_d
115  real(c_rp) :: beta
116  integer(c_int) :: n
117  end subroutine hip_fusedcg_update_p
118  end interface
119 
120  interface
121  subroutine hip_fusedcg_update_x(x_d, p_d, alpha, p_cur, n) &
122  bind(c, name='hip_fusedcg_update_x')
123  use, intrinsic :: iso_c_binding
124  implicit none
125  type(c_ptr), value :: x_d, p_d, alpha
126  integer(c_int) :: p_cur, n
127  end subroutine hip_fusedcg_update_x
128  end interface
129 
130  interface
131  real(c_rp) function hip_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
132  p_cur, n) bind(c, name='hip_fusedcg_part2')
133  use, intrinsic :: iso_c_binding
134  import c_rp
135  implicit none
136  type(c_ptr), value :: a_d, b_d, c_d, alpha_d
137  real(c_rp) :: alpha
138  integer(c_int) :: n, p_cur
139  end function hip_fusedcg_part2
140  end interface
141 #endif
142 
143 contains
144 
145  subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
146  type(c_ptr), value :: p_d, z_d, po_d
147  real(c_rp) :: beta
148  integer(c_int) :: n
149 #ifdef HAVE_HIP
150  call hip_fusedcg_update_p(p_d, z_d, po_d, beta, n)
151 #elif HAVE_CUDA
152  call cuda_fusedcg_update_p(p_d, z_d, po_d, beta, n)
153 #else
154  call neko_error('No device backend configured')
155 #endif
156  end subroutine device_fusedcg_update_p
157 
158  subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
159  type(c_ptr), value :: x_d, p_d, alpha
160  integer(c_int) :: p_cur, n
161 #ifdef HAVE_HIP
162  call hip_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
163 #elif HAVE_CUDA
164  call cuda_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
165 #else
166  call neko_error('No device backend configured')
167 #endif
168  end subroutine device_fusedcg_update_x
169 
170  function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
171  p_cur, n) result(res)
172  type(c_ptr), value :: a_d, b_d, c_d, alpha_d
173  real(c_rp) :: alpha
174  integer :: n, p_cur
175  real(kind=rp) :: res
176  integer :: ierr
177 #ifdef HAVE_HIP
178  res = hip_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
179 #elif HAVE_CUDA
180  res = cuda_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
181 #else
182  call neko_error('No device backend configured')
183 #endif
184 
185 #ifndef HAVE_DEVICE_MPI
186  if (pe_size .gt. 1) then
187  call mpi_allreduce(mpi_in_place, res, 1, &
188  mpi_real_precision, mpi_sum, neko_comm, ierr)
189  end if
190 #endif
191 
192  end function device_fusedcg_part2
193 
195  subroutine fusedcg_device_init(this, n, max_iter, M, rel_tol, abs_tol, &
196  monitor)
197  class(fusedcg_device_t), target, intent(inout) :: this
198  class(pc_t), optional, intent(inout), target :: M
199  integer, intent(in) :: n
200  integer, intent(in) :: max_iter
201  real(kind=rp), optional, intent(inout) :: rel_tol
202  real(kind=rp), optional, intent(inout) :: abs_tol
203  logical, optional, intent(in) :: monitor
204  type(c_ptr) :: ptr
205  integer(c_size_t) :: p_size
206  integer :: i
207 
208  call this%free()
209 
210  allocate(this%w(n))
211  allocate(this%r(n))
212  allocate(this%z(n))
213  allocate(this%p(n, device_fusedcg_p_space))
214  allocate(this%p_d(device_fusedcg_p_space))
215  allocate(this%alpha(device_fusedcg_p_space))
216 
217  if (present(m)) then
218  this%M => m
219  end if
220 
221  call device_map(this%w, this%w_d, n)
222  call device_map(this%r, this%r_d, n)
223  call device_map(this%z, this%z_d, n)
224  call device_map(this%alpha, this%alpha_d, device_fusedcg_p_space)
225  do i = 1, device_fusedcg_p_space+1
226  this%p_d(i) = c_null_ptr
227  call device_map(this%p(:,i), this%p_d(i), n)
228  end do
229 
230  p_size = c_sizeof(c_null_ptr) * (device_fusedcg_p_space)
231  call device_alloc(this%p_d_d, p_size)
232  ptr = c_loc(this%p_d)
233  call device_memcpy(ptr, this%p_d_d, p_size, &
234  host_to_device, sync=.false.)
235  if (present(rel_tol) .and. present(abs_tol) .and. present(monitor)) then
236  call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
237  else if (present(rel_tol) .and. present(abs_tol)) then
238  call this%ksp_init(max_iter, rel_tol, abs_tol)
239  else if (present(monitor) .and. present(abs_tol)) then
240  call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
241  else if (present(rel_tol) .and. present(monitor)) then
242  call this%ksp_init(max_iter, rel_tol, monitor = monitor)
243  else if (present(rel_tol)) then
244  call this%ksp_init(max_iter, rel_tol = rel_tol)
245  else if (present(abs_tol)) then
246  call this%ksp_init(max_iter, abs_tol = abs_tol)
247  else if (present(monitor)) then
248  call this%ksp_init(max_iter, monitor = monitor)
249  else
250  call this%ksp_init(max_iter)
251  end if
252 
253  call device_event_create(this%gs_event, 2)
254 
255  end subroutine fusedcg_device_init
256 
258  subroutine fusedcg_device_free(this)
259  class(fusedcg_device_t), intent(inout) :: this
260  integer :: i
261 
262  call this%ksp_free()
263 
264  if (allocated(this%w)) then
265  deallocate(this%w)
266  end if
267 
268  if (allocated(this%r)) then
269  deallocate(this%r)
270  end if
271 
272 
273  if (allocated(this%z)) then
274  deallocate(this%z)
275  end if
276 
277 
278  if (allocated(this%alpha)) then
279  deallocate(this%alpha)
280  end if
281 
282  if (allocated(this%p)) then
283  deallocate(this%p)
284  end if
285 
286  if (c_associated(this%w_d)) then
287  call device_free(this%w_d)
288  end if
289 
290  if (c_associated(this%r_d)) then
291  call device_free(this%r_d)
292  end if
293 
294  if (c_associated(this%z_d)) then
295  call device_free(this%z_d)
296  end if
297 
298  if (c_associated(this%alpha_d)) then
299  call device_free(this%alpha_d)
300  end if
301 
302  if (allocated(this%p_d)) then
303  do i = 1, device_fusedcg_p_space
304  if (c_associated(this%p_d(i))) then
305  call device_free(this%p_d(i))
306  end if
307  end do
308  end if
309 
310  nullify(this%M)
311 
312  if (c_associated(this%gs_event)) then
313  call device_event_destroy(this%gs_event)
314  end if
315 
316  end subroutine fusedcg_device_free
317 
319  function fusedcg_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter) result(ksp_results)
320  class(fusedcg_device_t), intent(inout) :: this
321  class(ax_t), intent(inout) :: ax
322  type(field_t), intent(inout) :: x
323  integer, intent(in) :: n
324  real(kind=rp), dimension(n), intent(inout) :: f
325  type(coef_t), intent(inout) :: coef
326  type(bc_list_t), intent(inout) :: blst
327  type(gs_t), intent(inout) :: gs_h
328  type(ksp_monitor_t) :: ksp_results
329  integer, optional, intent(in) :: niter
330  integer :: iter, max_iter, ierr, i, p_cur, p_prev
331  real(kind=rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
332  real(kind=rp) :: pap, beta
333  type(c_ptr) :: f_d
334  f_d = device_get_ptr(f)
335 
336  if (present(niter)) then
337  max_iter = niter
338  else
339  max_iter = ksp_max_iter
340  end if
341  norm_fac = 1.0_rp / sqrt(coef%volume)
342 
343  associate(w => this%w, r => this%r, p => this%p, z => this%z, &
344  alpha => this%alpha, alpha_d => this%alpha_d, &
345  w_d => this%w_d, r_d => this%r_d, z_d => this%z_d, &
346  p_d => this%p_d, p_d_d => this%p_d_d)
347 
348  rtz1 = 1.0_rp
349  p_prev = device_fusedcg_p_space
350  p_cur = 1
351  call device_rzero(x%x_d, n)
352  call device_rzero(p_d(1), n)
353  call device_copy(r_d, f_d, n)
354 
355  rtr = device_glsc3(r_d, coef%mult_d, r_d, n)
356  rnorm = sqrt(rtr)*norm_fac
357  ksp_results%res_start = rnorm
358  ksp_results%res_final = rnorm
359  ksp_results%iter = 0
360  if(abscmp(rnorm, 0.0_rp)) return
361 
362  call this%monitor_start('FusedCG')
363  do iter = 1, max_iter
364  call this%M%solve(z, r, n)
365  rtz2 = rtz1
366  rtz1 = device_glsc3(r_d, coef%mult_d, z_d, n)
367  beta = rtz1 / rtz2
368  if (iter .eq. 1) beta = 0.0_rp
369  call device_fusedcg_update_p(p_d(p_cur), z_d, p_d(p_prev), beta, n)
370 
371  call ax%compute(w, p(1, p_cur), coef, x%msh, x%Xh)
372  call gs_h%op(w, n, gs_op_add, this%gs_event)
373  call device_event_sync(this%gs_event)
374  call bc_list_apply(blst, w, n)
375 
376  pap = device_glsc3(w_d, coef%mult_d, this%p_d(p_cur), n)
377 
378  alpha(p_cur) = rtz1 / pap
379  rtr = device_fusedcg_part2(r_d, coef%mult_d, w_d, &
380  alpha_d, alpha(p_cur), p_cur, n)
381  rnorm = sqrt(rtr)*norm_fac
382  call this%monitor_iter(iter, rnorm)
383  if ((p_cur .eq. device_fusedcg_p_space) .or. &
384  (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter) then
385  call device_fusedcg_update_x(x%x_d, p_d_d, alpha_d, p_cur, n)
386  p_prev = p_cur
387  p_cur = 1
388  if (rnorm .lt. this%abs_tol) exit
389  else
390  p_prev = p_cur
391  p_cur = p_cur + 1
392  end if
393  end do
394  call this%monitor_stop()
395  ksp_results%res_final = rnorm
396  ksp_results%iter = iter
397 
398  end associate
399 
400  end function fusedcg_device_solve
401 
403  function fusedcg_device_solve_coupled(this, Ax, x, y, z, fx, fy, fz, &
404  n, coef, blstx, blsty, blstz, gs_h, niter) result(ksp_results)
405  class(fusedcg_device_t), intent(inout) :: this
406  class(ax_t), intent(inout) :: ax
407  type(field_t), intent(inout) :: x
408  type(field_t), intent(inout) :: y
409  type(field_t), intent(inout) :: z
410  integer, intent(in) :: n
411  real(kind=rp), dimension(n), intent(inout) :: fx
412  real(kind=rp), dimension(n), intent(inout) :: fy
413  real(kind=rp), dimension(n), intent(inout) :: fz
414  type(coef_t), intent(inout) :: coef
415  type(bc_list_t), intent(inout) :: blstx
416  type(bc_list_t), intent(inout) :: blsty
417  type(bc_list_t), intent(inout) :: blstz
418  type(gs_t), intent(inout) :: gs_h
419  type(ksp_monitor_t), dimension(3) :: ksp_results
420  integer, optional, intent(in) :: niter
421 
422  ksp_results(1) = this%solve(ax, x, fx, n, coef, blstx, gs_h, niter)
423  ksp_results(2) = this%solve(ax, y, fy, n, coef, blsty, gs_h, niter)
424  ksp_results(3) = this%solve(ax, z, fz, n, coef, blstz, gs_h, niter)
425 
426  end function fusedcg_device_solve_coupled
427 
428 end module fusedcg_device
429 
430 
void hip_fusedcg_update_x(void *x, void *p, void *alpha, int *p_cur, int *n)
Definition: fusedcg_aux.hip:67
real hip_fusedcg_part2(void *a, void *b, void *c, void *alpha_d, real *alpha, int *p_cur, int *n)
Definition: fusedcg_aux.hip:81
void hip_fusedcg_update_p(void *p, void *z, void *po, real *beta, int *n)
Definition: fusedcg_aux.hip:54
Defines a Matrix-vector product.
Definition: ax.f90:34
Defines a boundary condition.
Definition: bc.f90:34
Coefficients.
Definition: coef.f90:34
Definition: comm.F90:1
subroutine, public device_rzero(a_d, n)
Zero a real vector.
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n)
Weighted inner product .
subroutine, public device_copy(a_d, b_d, n)
Copy a vector .
Definition: device_math.F90:76
Device abstraction, common interface for various accelerators.
Definition: device.F90:34
Defines a field.
Definition: field.f90:34
Defines a fused Conjugate Gradient method for accelerators.
subroutine fusedcg_device_free(this)
Deallocate a pipelined PCG solver.
type(ksp_monitor_t) function fusedcg_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
real(kind=rp) function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
integer, parameter device_fusedcg_p_space
subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
subroutine fusedcg_device_init(this, n, max_iter, M, rel_tol, abs_tol, monitor)
Initialise a fused PCG solver.
type(ksp_monitor_t) function, dimension(3) fusedcg_device_solve_coupled(this, Ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Pipelined PCG solve coupled solve.
subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
Gather-scatter.
Implements the base abstract type for Krylov solvers plus helper types.
Definition: krylov.f90:34
integer, parameter, public ksp_max_iter
Maximum number of iters.
Definition: krylov.f90:51
Definition: math.f90:60
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
Definition: math.f90:895
subroutine, public copy(a, b, n)
Copy a vector .
Definition: math.f90:239
subroutine, public rzero(a, n)
Zero a real vector.
Definition: math.f90:195
integer, parameter, public c_rp
Definition: num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition: num_types.f90:12
Krylov preconditioner.
Definition: precon.f90:34
Base type for a matrix-vector product providing .
Definition: ax.f90:43
A list of boundary conditions.
Definition: bc.f90:104
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Definition: coef.f90:55
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Definition: krylov.f90:56
Base abstract type for a canonical Krylov method, solving .
Definition: krylov.f90:66
Defines a canonical Krylov preconditioner.
Definition: precon.f90:40