Neko 0.9.99
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
fusedcg_device.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2024, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
36 use precon, only : pc_t
37 use ax_product, only : ax_t
38 use num_types, only: rp, c_rp
39 use field, only : field_t
40 use coefs, only : coef_t
41 use gather_scatter, only : gs_t, gs_op_add
42 use bc_list, only : bc_list_t
43 use math, only : glsc3, rzero, copy, abscmp
45 use device
46 use comm
47 implicit none
48 private
49
50 integer, parameter :: device_fusedcg_p_space = 10
51
53 type, public, extends(ksp_t) :: fusedcg_device_t
54 real(kind=rp), allocatable :: w(:)
55 real(kind=rp), allocatable :: r(:)
56 real(kind=rp), allocatable :: z(:)
57 real(kind=rp), allocatable :: p(:,:)
58 real(kind=rp), allocatable :: alpha(:)
59 type(c_ptr) :: w_d = c_null_ptr
60 type(c_ptr) :: r_d = c_null_ptr
61 type(c_ptr) :: z_d = c_null_ptr
62 type(c_ptr) :: alpha_d = c_null_ptr
63 type(c_ptr) :: p_d_d = c_null_ptr
64 type(c_ptr), allocatable :: p_d(:)
65 type(c_ptr) :: gs_event = c_null_ptr
66 contains
67 procedure, pass(this) :: init => fusedcg_device_init
68 procedure, pass(this) :: free => fusedcg_device_free
69 procedure, pass(this) :: solve => fusedcg_device_solve
70 procedure, pass(this) :: solve_coupled => fusedcg_device_solve_coupled
71 end type fusedcg_device_t
72
73#ifdef HAVE_CUDA
74 interface
75 subroutine cuda_fusedcg_update_p(p_d, z_d, po_d, beta, n) &
76 bind(c, name='cuda_fusedcg_update_p')
77 use, intrinsic :: iso_c_binding
78 import c_rp
79 implicit none
80 type(c_ptr), value :: p_d, z_d, po_d
81 real(c_rp) :: beta
82 integer(c_int) :: n
83 end subroutine cuda_fusedcg_update_p
84 end interface
85
86 interface
87 subroutine cuda_fusedcg_update_x(x_d, p_d, alpha, p_cur, n) &
88 bind(c, name='cuda_fusedcg_update_x')
89 use, intrinsic :: iso_c_binding
90 implicit none
91 type(c_ptr), value :: x_d, p_d, alpha
92 integer(c_int) :: p_cur, n
93 end subroutine cuda_fusedcg_update_x
94 end interface
95
96 interface
97 real(c_rp) function cuda_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
98 p_cur, n) bind(c, name='cuda_fusedcg_part2')
99 use, intrinsic :: iso_c_binding
100 import c_rp
101 implicit none
102 type(c_ptr), value :: a_d, b_d, c_d, alpha_d
103 real(c_rp) :: alpha
104 integer(c_int) :: n, p_cur
105 end function cuda_fusedcg_part2
106 end interface
107#elif HAVE_HIP
108 interface
109 subroutine hip_fusedcg_update_p(p_d, z_d, po_d, beta, n) &
110 bind(c, name='hip_fusedcg_update_p')
111 use, intrinsic :: iso_c_binding
112 import c_rp
113 implicit none
114 type(c_ptr), value :: p_d, z_d, po_d
115 real(c_rp) :: beta
116 integer(c_int) :: n
117 end subroutine hip_fusedcg_update_p
118 end interface
119
120 interface
121 subroutine hip_fusedcg_update_x(x_d, p_d, alpha, p_cur, n) &
122 bind(c, name='hip_fusedcg_update_x')
123 use, intrinsic :: iso_c_binding
124 implicit none
125 type(c_ptr), value :: x_d, p_d, alpha
126 integer(c_int) :: p_cur, n
127 end subroutine hip_fusedcg_update_x
128 end interface
129
130 interface
131 real(c_rp) function hip_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
132 p_cur, n) bind(c, name='hip_fusedcg_part2')
133 use, intrinsic :: iso_c_binding
134 import c_rp
135 implicit none
136 type(c_ptr), value :: a_d, b_d, c_d, alpha_d
137 real(c_rp) :: alpha
138 integer(c_int) :: n, p_cur
139 end function hip_fusedcg_part2
140 end interface
141#endif
142
143contains
144
145 subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
146 type(c_ptr), value :: p_d, z_d, po_d
147 real(c_rp) :: beta
148 integer(c_int) :: n
149#ifdef HAVE_HIP
150 call hip_fusedcg_update_p(p_d, z_d, po_d, beta, n)
151#elif HAVE_CUDA
152 call cuda_fusedcg_update_p(p_d, z_d, po_d, beta, n)
153#else
154 call neko_error('No device backend configured')
155#endif
156 end subroutine device_fusedcg_update_p
157
158 subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
159 type(c_ptr), value :: x_d, p_d, alpha
160 integer(c_int) :: p_cur, n
161#ifdef HAVE_HIP
162 call hip_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
163#elif HAVE_CUDA
164 call cuda_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
165#else
166 call neko_error('No device backend configured')
167#endif
168 end subroutine device_fusedcg_update_x
169
170 function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, &
171 p_cur, n) result(res)
172 type(c_ptr), value :: a_d, b_d, c_d, alpha_d
173 real(c_rp) :: alpha
174 integer :: n, p_cur
175 real(kind=rp) :: res
176 integer :: ierr
177#ifdef HAVE_HIP
178 res = hip_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
179#elif HAVE_CUDA
180 res = cuda_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
181#else
182 call neko_error('No device backend configured')
183#endif
184
185#ifndef HAVE_DEVICE_MPI
186 if (pe_size .gt. 1) then
187 call mpi_allreduce(mpi_in_place, res, 1, &
188 mpi_real_precision, mpi_sum, neko_comm, ierr)
189 end if
190#endif
191
192 end function device_fusedcg_part2
193
195 subroutine fusedcg_device_init(this, n, max_iter, M, rel_tol, abs_tol, &
196 monitor)
197 class(fusedcg_device_t), target, intent(inout) :: this
198 class(pc_t), optional, intent(in), target :: M
199 integer, intent(in) :: n
200 integer, intent(in) :: max_iter
201 real(kind=rp), optional, intent(in) :: rel_tol
202 real(kind=rp), optional, intent(in) :: abs_tol
203 logical, optional, intent(in) :: monitor
204 type(c_ptr) :: ptr
205 integer(c_size_t) :: p_size
206 integer :: i
207
208 call this%free()
209
210 allocate(this%w(n))
211 allocate(this%r(n))
212 allocate(this%z(n))
213 allocate(this%p(n, device_fusedcg_p_space))
214 allocate(this%p_d(device_fusedcg_p_space))
215 allocate(this%alpha(device_fusedcg_p_space))
216
217 if (present(m)) then
218 this%M => m
219 end if
220
221 call device_map(this%w, this%w_d, n)
222 call device_map(this%r, this%r_d, n)
223 call device_map(this%z, this%z_d, n)
224 call device_map(this%alpha, this%alpha_d, device_fusedcg_p_space)
225 do i = 1, device_fusedcg_p_space+1
226 this%p_d(i) = c_null_ptr
227 call device_map(this%p(:,i), this%p_d(i), n)
228 end do
229
230 p_size = c_sizeof(c_null_ptr) * (device_fusedcg_p_space)
231 call device_alloc(this%p_d_d, p_size)
232 ptr = c_loc(this%p_d)
233 call device_memcpy(ptr, this%p_d_d, p_size, &
234 host_to_device, sync=.false.)
235 if (present(rel_tol) .and. present(abs_tol) .and. present(monitor)) then
236 call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
237 else if (present(rel_tol) .and. present(abs_tol)) then
238 call this%ksp_init(max_iter, rel_tol, abs_tol)
239 else if (present(monitor) .and. present(abs_tol)) then
240 call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
241 else if (present(rel_tol) .and. present(monitor)) then
242 call this%ksp_init(max_iter, rel_tol, monitor = monitor)
243 else if (present(rel_tol)) then
244 call this%ksp_init(max_iter, rel_tol = rel_tol)
245 else if (present(abs_tol)) then
246 call this%ksp_init(max_iter, abs_tol = abs_tol)
247 else if (present(monitor)) then
248 call this%ksp_init(max_iter, monitor = monitor)
249 else
250 call this%ksp_init(max_iter)
251 end if
252
253 call device_event_create(this%gs_event, 2)
254
255 end subroutine fusedcg_device_init
256
258 subroutine fusedcg_device_free(this)
259 class(fusedcg_device_t), intent(inout) :: this
260 integer :: i
261
262 call this%ksp_free()
263
264 if (allocated(this%w)) then
265 deallocate(this%w)
266 end if
267
268 if (allocated(this%r)) then
269 deallocate(this%r)
270 end if
271
272
273 if (allocated(this%z)) then
274 deallocate(this%z)
275 end if
276
277
278 if (allocated(this%alpha)) then
279 deallocate(this%alpha)
280 end if
281
282 if (allocated(this%p)) then
283 deallocate(this%p)
284 end if
285
286 if (c_associated(this%w_d)) then
287 call device_free(this%w_d)
288 end if
289
290 if (c_associated(this%r_d)) then
291 call device_free(this%r_d)
292 end if
293
294 if (c_associated(this%z_d)) then
295 call device_free(this%z_d)
296 end if
297
298 if (c_associated(this%alpha_d)) then
299 call device_free(this%alpha_d)
300 end if
301
302 if (allocated(this%p_d)) then
303 do i = 1, device_fusedcg_p_space
304 if (c_associated(this%p_d(i))) then
305 call device_free(this%p_d(i))
306 end if
307 end do
308 end if
309
310 nullify(this%M)
311
312 if (c_associated(this%gs_event)) then
313 call device_event_destroy(this%gs_event)
314 end if
315
316 end subroutine fusedcg_device_free
317
319 function fusedcg_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter) result(ksp_results)
320 class(fusedcg_device_t), intent(inout) :: this
321 class(ax_t), intent(in) :: ax
322 type(field_t), intent(inout) :: x
323 integer, intent(in) :: n
324 real(kind=rp), dimension(n), intent(in) :: f
325 type(coef_t), intent(inout) :: coef
326 type(bc_list_t), intent(inout) :: blst
327 type(gs_t), intent(inout) :: gs_h
328 type(ksp_monitor_t) :: ksp_results
329 integer, optional, intent(in) :: niter
330 integer :: iter, max_iter, ierr, i, p_cur, p_prev
331 real(kind=rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
332 real(kind=rp) :: pap, beta
333 type(c_ptr) :: f_d
334 f_d = device_get_ptr(f)
335
336 if (present(niter)) then
337 max_iter = niter
338 else
339 max_iter = ksp_max_iter
340 end if
341 norm_fac = 1.0_rp / sqrt(coef%volume)
342
343 associate(w => this%w, r => this%r, p => this%p, z => this%z, &
344 alpha => this%alpha, alpha_d => this%alpha_d, &
345 w_d => this%w_d, r_d => this%r_d, z_d => this%z_d, &
346 p_d => this%p_d, p_d_d => this%p_d_d)
347
348 rtz1 = 1.0_rp
350 p_cur = 1
351 call device_rzero(x%x_d, n)
352 call device_rzero(p_d(1), n)
353 call device_copy(r_d, f_d, n)
354
355 rtr = device_glsc3(r_d, coef%mult_d, r_d, n)
356 rnorm = sqrt(rtr)*norm_fac
357 ksp_results%res_start = rnorm
358 ksp_results%res_final = rnorm
359 ksp_results%iter = 0
360 if(abscmp(rnorm, 0.0_rp)) return
361
362 call this%monitor_start('FusedCG')
363 do iter = 1, max_iter
364 call this%M%solve(z, r, n)
365 rtz2 = rtz1
366 rtz1 = device_glsc3(r_d, coef%mult_d, z_d, n)
367 beta = rtz1 / rtz2
368 if (iter .eq. 1) beta = 0.0_rp
369 call device_fusedcg_update_p(p_d(p_cur), z_d, p_d(p_prev), beta, n)
370
371 call ax%compute(w, p(1, p_cur), coef, x%msh, x%Xh)
372 call gs_h%op(w, n, gs_op_add, this%gs_event)
373 call device_event_sync(this%gs_event)
374 call blst%apply(w, n)
375
376 pap = device_glsc3(w_d, coef%mult_d, this%p_d(p_cur), n)
377
378 alpha(p_cur) = rtz1 / pap
379 rtr = device_fusedcg_part2(r_d, coef%mult_d, w_d, &
380 alpha_d, alpha(p_cur), p_cur, n)
381 rnorm = sqrt(rtr)*norm_fac
382 call this%monitor_iter(iter, rnorm)
383 if ((p_cur .eq. device_fusedcg_p_space) .or. &
384 (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter) then
385 call device_fusedcg_update_x(x%x_d, p_d_d, alpha_d, p_cur, n)
386 p_prev = p_cur
387 p_cur = 1
388 if (rnorm .lt. this%abs_tol) exit
389 else
390 p_prev = p_cur
391 p_cur = p_cur + 1
392 end if
393 end do
394 call this%monitor_stop()
395 ksp_results%res_final = rnorm
396 ksp_results%iter = iter
397 ksp_results%converged = this%is_converged(iter, rnorm)
398
399 end associate
400
401 end function fusedcg_device_solve
402
404 function fusedcg_device_solve_coupled(this, Ax, x, y, z, fx, fy, fz, &
405 n, coef, blstx, blsty, blstz, gs_h, niter) result(ksp_results)
406 class(fusedcg_device_t), intent(inout) :: this
407 class(ax_t), intent(in) :: ax
408 type(field_t), intent(inout) :: x
409 type(field_t), intent(inout) :: y
410 type(field_t), intent(inout) :: z
411 integer, intent(in) :: n
412 real(kind=rp), dimension(n), intent(in) :: fx
413 real(kind=rp), dimension(n), intent(in) :: fy
414 real(kind=rp), dimension(n), intent(in) :: fz
415 type(coef_t), intent(inout) :: coef
416 type(bc_list_t), intent(inout) :: blstx
417 type(bc_list_t), intent(inout) :: blsty
418 type(bc_list_t), intent(inout) :: blstz
419 type(gs_t), intent(inout) :: gs_h
420 type(ksp_monitor_t), dimension(3) :: ksp_results
421 integer, optional, intent(in) :: niter
422
423 ksp_results(1) = this%solve(ax, x, fx, n, coef, blstx, gs_h, niter)
424 ksp_results(2) = this%solve(ax, y, fy, n, coef, blsty, gs_h, niter)
425 ksp_results(3) = this%solve(ax, z, fz, n, coef, blstz, gs_h, niter)
426
428
429end module fusedcg_device
430
431
void hip_fusedcg_update_x(void *x, void *p, void *alpha, int *p_cur, int *n)
real hip_fusedcg_part2(void *a, void *b, void *c, void *alpha_d, real *alpha, int *p_cur, int *n)
void hip_fusedcg_update_p(void *p, void *z, void *po, real *beta, int *n)
Defines a Matrix-vector product.
Definition ax.f90:34
Defines a list of bc_t.
Definition bc_list.f90:34
Coefficients.
Definition coef.f90:34
Definition comm.F90:1
subroutine, public device_rzero(a_d, n)
Zero a real vector.
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n)
Weighted inner product .
subroutine, public device_copy(a_d, b_d, n)
Copy a vector .
Device abstraction, common interface for various accelerators.
Definition device.F90:34
Defines a field.
Definition field.f90:34
Defines a fused Conjugate Gradient method for accelerators.
subroutine fusedcg_device_free(this)
Deallocate a pipelined PCG solver.
real(kind=rp) function device_fusedcg_part2(a_d, b_d, c_d, alpha_d, alpha, p_cur, n)
type(ksp_monitor_t) function, dimension(3) fusedcg_device_solve_coupled(this, ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Pipelined PCG solve coupled solve.
integer, parameter device_fusedcg_p_space
subroutine device_fusedcg_update_x(x_d, p_d, alpha, p_cur, n)
subroutine fusedcg_device_init(this, n, max_iter, m, rel_tol, abs_tol, monitor)
Initialise a fused PCG solver.
type(ksp_monitor_t) function fusedcg_device_solve(this, ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
subroutine device_fusedcg_update_p(p_d, z_d, po_d, beta, n)
Gather-scatter.
Implements the base abstract type for Krylov solvers plus helper types.
Definition krylov.f90:34
integer, parameter, public ksp_max_iter
Maximum number of iters.
Definition krylov.f90:51
Definition math.f90:60
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
Definition math.f90:894
subroutine, public copy(a, b, n)
Copy a vector .
Definition math.f90:238
subroutine, public rzero(a, n)
Zero a real vector.
Definition math.f90:194
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Krylov preconditioner.
Definition precon.f90:34
Base type for a matrix-vector product providing .
Definition ax.f90:43
A list of allocatable `bc_t`. Follows the standard interface of lists.
Definition bc_list.f90:46
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Definition coef.f90:55
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Definition krylov.f90:56
Base abstract type for a canonical Krylov method, solving .
Definition krylov.f90:68
Defines a canonical Krylov preconditioner.
Definition precon.f90:40