Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
fusedcg_device.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2024, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
36 use precon, only : pc_t
37 use ax_product, only : ax_t
38 use num_types, only: rp, c_rp
39 use field, only : field_t
40 use coefs, only : coef_t
41 use gather_scatter, only : gs_t, gs_op_add
42 use bc_list, only : bc_list_t
43 use math, only : glsc3, rzero, copy, abscmp
48 use utils, only : neko_error
50 use mpi_f08, only : mpi_allreduce, mpi_in_place, mpi_sum
51 use, intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr, &
52 c_associated, c_size_t, c_sizeof, c_int, c_loc
53 implicit none
54 private
55
56 integer, parameter :: device_fusedcg_p_space = 10
57
59 type, public, extends(ksp_t) :: fusedcg_device_t
60 real(kind=rp), allocatable :: w(:)
61 real(kind=rp), allocatable :: r(:)
62 real(kind=rp), allocatable :: z(:)
63 real(kind=rp), allocatable :: p(:,:)
64 real(kind=rp), allocatable :: alpha(:)
65 type(c_ptr) :: w_d = c_null_ptr
66 type(c_ptr) :: r_d = c_null_ptr
67 type(c_ptr) :: z_d = c_null_ptr
68 type(c_ptr) :: alpha_d = c_null_ptr
69 type(c_ptr) :: p_d_d = c_null_ptr
70 type(c_ptr), allocatable :: p_d(:)
71 type(c_ptr) :: gs_event = c_null_ptr
72 contains
73 procedure, pass(this) :: init => fusedcg_device_init
74 procedure, pass(this) :: free => fusedcg_device_free
75 procedure, pass(this) :: solve => fusedcg_device_solve
76 procedure, pass(this) :: solve_coupled => fusedcg_device_solve_coupled
77 end type fusedcg_device_t
78
79#ifdef HAVE_CUDA
80 interface
81 subroutine cuda_fusedcg_update_p(p_d, z_d, po_d, beta, n) &
82 bind(c, name='cuda_fusedcg_update_p')
83 use, intrinsic :: iso_c_binding
84 import c_rp
85 implicit none
86 type(c_ptr), value :: p_d, z_d, po_d
87 real(c_rp) :: beta
88 integer(c_int) :: n
89 end subroutine cuda_fusedcg_update_p
90 end interface
91
92 interface
93 subroutine cuda_fusedcg_update_x(x_d, p_d, alpha, p_cur, n) &
94 bind(c, name='cuda_fusedcg_update_x')
95 use, intrinsic :: iso_c_binding
96 implicit none
97 type(c_ptr), value :: x_d, p_d, alpha
98 integer(c_int) :: p_cur, n
99 end subroutine cuda_fusedcg_update_x
100 end interface
101
102 interface
103 real(c_rp) function cuda_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
104 p_cur, n) bind(c, name='cuda_fusedcg_part2')
105 use, intrinsic :: iso_c_binding
106 import c_rp
107 implicit none
108 type(c_ptr), value :: a_d, b_d, c_d, alpha_d
109 real(c_rp) :: alpha
110 integer(c_int) :: n, p_cur
111 end function cuda_fusedcg_part2
112 end interface
113#elif HAVE_HIP
114 interface
115 subroutine hip_fusedcg_update_p(p_d, z_d, po_d, beta, n) &
116 bind(c, name='hip_fusedcg_update_p')
117 use, intrinsic :: iso_c_binding
118 import c_rp
119 implicit none
120 type(c_ptr), value :: p_d, z_d, po_d
121 real(c_rp) :: beta
122 integer(c_int) :: n
123 end subroutine hip_fusedcg_update_p
124 end interface
125
126 interface
127 subroutine hip_fusedcg_update_x(x_d, p_d, alpha, p_cur, n) &
128 bind(c, name='hip_fusedcg_update_x')
129 use, intrinsic :: iso_c_binding
130 implicit none
131 type(c_ptr), value :: x_d, p_d, alpha
132 integer(c_int) :: p_cur, n
133 end subroutine hip_fusedcg_update_x
134 end interface
135
136 interface
137 real(c_rp) function hip_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
138 p_cur, n) bind(c, name='hip_fusedcg_part2')
139 use, intrinsic :: iso_c_binding
140 import c_rp
141 implicit none
142 type(c_ptr), value :: a_d, b_d, c_d, alpha_d
143 real(c_rp) :: alpha
144 integer(c_int) :: n, p_cur
145 end function hip_fusedcg_part2
146 end interface
147#endif
148
149contains
150
151 subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
152 type(c_ptr), value :: p_d, z_d, po_d
153 real(c_rp) :: beta
154 integer(c_int) :: n
155#ifdef HAVE_HIP
156 call hip_fusedcg_update_p(p_d, z_d, po_d, beta, n)
157#elif HAVE_CUDA
158 call cuda_fusedcg_update_p(p_d, z_d, po_d, beta, n)
159#else
160 call neko_error('No device backend configured')
161#endif
162 end subroutine device_fusedcg_update_p
163
164 subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
165 type(c_ptr), value :: x_d, p_d, alpha
166 integer(c_int) :: p_cur, n
167#ifdef HAVE_HIP
168 call hip_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
169#elif HAVE_CUDA
170 call cuda_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
171#else
172 call neko_error('No device backend configured')
173#endif
174 end subroutine device_fusedcg_update_x
175
176 function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
177 p_cur, n) result(res)
178 type(c_ptr), value :: a_d, b_d, c_d, alpha_d
179 real(c_rp) :: alpha
180 integer :: n, p_cur
181 real(kind=rp) :: res
182 integer :: ierr
183#ifdef HAVE_HIP
184 res = hip_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
185#elif HAVE_CUDA
186 res = cuda_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
187#else
188 call neko_error('No device backend configured')
189#endif
190
191#ifndef HAVE_DEVICE_MPI
192 if (pe_size .gt. 1) then
193 call mpi_allreduce(mpi_in_place, res, 1, &
194 mpi_real_precision, mpi_sum, neko_comm, ierr)
195 end if
196#endif
197
198 end function device_fusedcg_part2
199
201 subroutine fusedcg_device_init(this, n, max_iter, M, rel_tol, abs_tol, &
202 monitor)
203 class(fusedcg_device_t), target, intent(inout) :: this
204 class(pc_t), optional, intent(in), target :: M
205 integer, intent(in) :: n
206 integer, intent(in) :: max_iter
207 real(kind=rp), optional, intent(in) :: rel_tol
208 real(kind=rp), optional, intent(in) :: abs_tol
209 logical, optional, intent(in) :: monitor
210 type(c_ptr) :: ptr
211 integer(c_size_t) :: p_size
212 integer :: i
213
214 call this%free()
215
216 allocate(this%w(n))
217 allocate(this%r(n))
218 allocate(this%z(n))
219 allocate(this%p(n, device_fusedcg_p_space))
220 allocate(this%p_d(device_fusedcg_p_space))
221 allocate(this%alpha(device_fusedcg_p_space))
222
223 if (present(m)) then
224 this%M => m
225 end if
226
227 call device_map(this%w, this%w_d, n)
228 call device_map(this%r, this%r_d, n)
229 call device_map(this%z, this%z_d, n)
230 call device_map(this%alpha, this%alpha_d, device_fusedcg_p_space)
231 do i = 1, device_fusedcg_p_space+1
232 this%p_d(i) = c_null_ptr
233 call device_map(this%p(:,i), this%p_d(i), n)
234 end do
235
236 p_size = c_sizeof(c_null_ptr) * (device_fusedcg_p_space)
237 call device_alloc(this%p_d_d, p_size)
238 ptr = c_loc(this%p_d)
239 call device_memcpy(ptr, this%p_d_d, p_size, &
240 host_to_device, sync=.false.)
241 if (present(rel_tol) .and. present(abs_tol) .and. present(monitor)) then
242 call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
243 else if (present(rel_tol) .and. present(abs_tol)) then
244 call this%ksp_init(max_iter, rel_tol, abs_tol)
245 else if (present(monitor) .and. present(abs_tol)) then
246 call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
247 else if (present(rel_tol) .and. present(monitor)) then
248 call this%ksp_init(max_iter, rel_tol, monitor = monitor)
249 else if (present(rel_tol)) then
250 call this%ksp_init(max_iter, rel_tol = rel_tol)
251 else if (present(abs_tol)) then
252 call this%ksp_init(max_iter, abs_tol = abs_tol)
253 else if (present(monitor)) then
254 call this%ksp_init(max_iter, monitor = monitor)
255 else
256 call this%ksp_init(max_iter)
257 end if
258
259 call device_event_create(this%gs_event, 2)
260
261 end subroutine fusedcg_device_init
262
264 subroutine fusedcg_device_free(this)
265 class(fusedcg_device_t), intent(inout) :: this
266 integer :: i
267
268 call this%ksp_free()
269
270 if (allocated(this%w)) then
271 deallocate(this%w)
272 end if
273
274 if (allocated(this%r)) then
275 deallocate(this%r)
276 end if
277
278
279 if (allocated(this%z)) then
280 deallocate(this%z)
281 end if
282
283
284 if (allocated(this%alpha)) then
285 deallocate(this%alpha)
286 end if
287
288 if (allocated(this%p)) then
289 deallocate(this%p)
290 end if
291
292 if (c_associated(this%w_d)) then
293 call device_free(this%w_d)
294 end if
295
296 if (c_associated(this%r_d)) then
297 call device_free(this%r_d)
298 end if
299
300 if (c_associated(this%z_d)) then
301 call device_free(this%z_d)
302 end if
303
304 if (c_associated(this%alpha_d)) then
305 call device_free(this%alpha_d)
306 end if
307
308 if (allocated(this%p_d)) then
309 do i = 1, device_fusedcg_p_space
310 if (c_associated(this%p_d(i))) then
311 call device_free(this%p_d(i))
312 end if
313 end do
314 end if
315
316 if (c_associated(this%p_d_d)) then
317 call device_free(this%p_d_d)
318 end if
319
320 nullify(this%M)
321
322 if (c_associated(this%gs_event)) then
323 call device_event_destroy(this%gs_event)
324 end if
325
326 end subroutine fusedcg_device_free
327
329 function fusedcg_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter) result(ksp_results)
330 class(fusedcg_device_t), intent(inout) :: this
331 class(ax_t), intent(in) :: ax
332 type(field_t), intent(inout) :: x
333 integer, intent(in) :: n
334 real(kind=rp), dimension(n), intent(in) :: f
335 type(coef_t), intent(inout) :: coef
336 type(bc_list_t), intent(inout) :: blst
337 type(gs_t), intent(inout) :: gs_h
338 type(ksp_monitor_t) :: ksp_results
339 integer, optional, intent(in) :: niter
340 integer :: iter, max_iter, ierr, i, p_cur, p_prev
341 real(kind=rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
342 real(kind=rp) :: pap, beta
343 type(c_ptr) :: f_d
344 f_d = device_get_ptr(f)
345
346 if (present(niter)) then
347 max_iter = niter
348 else
349 max_iter = ksp_max_iter
350 end if
351 norm_fac = 1.0_rp / sqrt(coef%volume)
352
353 associate(w => this%w, r => this%r, p => this%p, z => this%z, &
354 alpha => this%alpha, alpha_d => this%alpha_d, &
355 w_d => this%w_d, r_d => this%r_d, z_d => this%z_d, &
356 p_d => this%p_d, p_d_d => this%p_d_d)
357
358 rtz1 = 1.0_rp
360 p_cur = 1
361 call device_rzero(x%x_d, n)
362 call device_rzero(p_d(1), n)
363 call device_copy(r_d, f_d, n)
364
365 rtr = device_glsc3(r_d, coef%mult_d, r_d, n)
366 rnorm = sqrt(rtr)*norm_fac
367 ksp_results%res_start = rnorm
368 ksp_results%res_final = rnorm
369 ksp_results%iter = 0
370 if(abscmp(rnorm, 0.0_rp)) then
371 ksp_results%converged = .true.
372 return
373 end if
374
375 call this%monitor_start('FusedCG')
376 do iter = 1, max_iter
377 call this%M%solve(z, r, n)
378 rtz2 = rtz1
379 rtz1 = device_glsc3(r_d, coef%mult_d, z_d, n)
380 beta = rtz1 / rtz2
381 if (iter .eq. 1) beta = 0.0_rp
382 call device_fusedcg_update_p(p_d(p_cur), z_d, p_d(p_prev), beta, n)
383
384 call ax%compute(w, p(1, p_cur), coef, x%msh, x%Xh)
385 call gs_h%op(w, n, gs_op_add, this%gs_event)
386 call device_event_sync(this%gs_event)
387 call blst%apply(w, n)
388
389 pap = device_glsc3(w_d, coef%mult_d, this%p_d(p_cur), n)
390
391 alpha(p_cur) = rtz1 / pap
392 rtr = device_fusedcg_part2(r_d, coef%mult_d, w_d, &
393 alpha_d, alpha(p_cur), p_cur, n)
394 rnorm = sqrt(rtr)*norm_fac
395 call this%monitor_iter(iter, rnorm)
396 if ((p_cur .eq. device_fusedcg_p_space) .or. &
397 (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter) then
398 call device_fusedcg_update_x(x%x_d, p_d_d, alpha_d, p_cur, n)
399 p_prev = p_cur
400 p_cur = 1
401 if (rnorm .lt. this%abs_tol) exit
402 else
403 p_prev = p_cur
404 p_cur = p_cur + 1
405 end if
406 end do
407 call this%monitor_stop()
408 ksp_results%res_final = rnorm
409 ksp_results%iter = iter
410 ksp_results%converged = this%is_converged(iter, rnorm)
411
412 end associate
413
414 end function fusedcg_device_solve
415
417 function fusedcg_device_solve_coupled(this, Ax, x, y, z, fx, fy, fz, &
418 n, coef, blstx, blsty, blstz, gs_h, niter) result(ksp_results)
419 class(fusedcg_device_t), intent(inout) :: this
420 class(ax_t), intent(in) :: ax
421 type(field_t), intent(inout) :: x
422 type(field_t), intent(inout) :: y
423 type(field_t), intent(inout) :: z
424 integer, intent(in) :: n
425 real(kind=rp), dimension(n), intent(in) :: fx
426 real(kind=rp), dimension(n), intent(in) :: fy
427 real(kind=rp), dimension(n), intent(in) :: fz
428 type(coef_t), intent(inout) :: coef
429 type(bc_list_t), intent(inout) :: blstx
430 type(bc_list_t), intent(inout) :: blsty
431 type(bc_list_t), intent(inout) :: blstz
432 type(gs_t), intent(inout) :: gs_h
433 type(ksp_monitor_t), dimension(3) :: ksp_results
434 integer, optional, intent(in) :: niter
435
436 ksp_results(1) = this%solve(ax, x, fx, n, coef, blstx, gs_h, niter)
437 ksp_results(2) = this%solve(ax, y, fy, n, coef, blsty, gs_h, niter)
438 ksp_results(3) = this%solve(ax, z, fz, n, coef, blstz, gs_h, niter)
439
441
442end module fusedcg_device
__device__ T solve(const T u, const T y, const T guess, const T nu, const T kappa, const T B)
void hip_fusedcg_update_x(void *x, void *p, void *alpha, int *p_cur, int *n)
real hip_fusedcg_part2(void *a, void *b, void *c, void *alpha_d, real *alpha, int *p_cur, int *n)
void hip_fusedcg_update_p(void *p, void *z, void *po, real *beta, int *n)
Return the device pointer for an associated Fortran array.
Definition device.F90:101
Map a Fortran array to a device (allocate and associate)
Definition device.F90:77
Copy data between host and device (or device and device)
Definition device.F90:71
Defines a Matrix-vector product.
Definition ax.f90:34
Defines a list of bc_t.
Definition bc_list.f90:34
Coefficients.
Definition coef.f90:34
Definition comm.F90:1
type(mpi_datatype), public mpi_real_precision
MPI type for working precision of REAL types.
Definition comm.F90:51
integer, public pe_size
MPI size of communicator.
Definition comm.F90:59
type(mpi_comm), public neko_comm
MPI communicator.
Definition comm.F90:43
subroutine, public device_rzero(a_d, n, strm)
Zero a real vector.
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
Device abstraction, common interface for various accelerators.
Definition device.F90:34
subroutine, public device_event_sync(event)
Synchronize an event.
Definition device.F90:1314
integer, parameter, public host_to_device
Definition device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:219
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition device.F90:1279
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
Definition device.F90:192
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition device.F90:1249
Defines a field.
Definition field.f90:34
Defines a fused Conjugate Gradient method for accelerators.
subroutine fusedcg_device_free(this)
Deallocate a pipelined PCG solver.
real(kind=rp) function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
type(ksp_monitor_t) function, dimension(3) fusedcg_device_solve_coupled(this, ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Pipelined PCG solve coupled solve.
integer, parameter device_fusedcg_p_space
subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
subroutine fusedcg_device_init(this, n, max_iter, m, rel_tol, abs_tol, monitor)
Initialise a fused PCG solver.
type(ksp_monitor_t) function fusedcg_device_solve(this, ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
Gather-scatter.
Implements the base abstract type for Krylov solvers plus helper types.
Definition krylov.f90:34
integer, parameter, public ksp_max_iter
Maximum number of iters.
Definition krylov.f90:51
Definition math.f90:60
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
Definition math.f90:1067
subroutine, public copy(a, b, n)
Copy a vector .
Definition math.f90:249
subroutine, public rzero(a, n)
Zero a real vector.
Definition math.f90:205
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Krylov preconditioner.
Definition precon.f90:34
Utilities.
Definition utils.f90:35
Base type for a matrix-vector product providing .
Definition ax.f90:43
A list of allocatable `bc_t`. Follows the standard interface of lists.
Definition bc_list.f90:48
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Definition coef.f90:55
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Definition krylov.f90:56
Base abstract type for a canonical Krylov method, solving .
Definition krylov.f90:73
Defines a canonical Krylov preconditioner.
Definition precon.f90:40