Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
tree_amg_smoother.f90
Go to the documentation of this file.
1! Copyright (c) 2024-2026, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
35 use tree_amg, only : tamg_hierarchy_t
37 use num_types, only : rp
38 use math, only : col2, add2, add2s2, glsc2, glsc3, sub2, cmult, &
43 use krylov, only : ksp_monitor_t
44 use bc_list, only: bc_list_t
45 use gather_scatter, only : gs_t, gs_op_add
46 use logger, only : neko_log, log_size
51 use, intrinsic :: iso_c_binding
52 implicit none
53 private
54
56 type, public :: amg_jacobi_t
57 real(kind=rp), allocatable :: d(:)
58 real(kind=rp), allocatable :: w(:)
59 real(kind=rp), allocatable :: r(:)
60 real(kind=rp) :: omega
61 integer :: lvl
62 integer :: n
63 integer :: max_iter = 10
64 logical :: recompute_diag = .true.
65 contains
66 procedure, pass(this) :: init => amg_jacobi_init
67 procedure, pass(this) :: solve => amg_jacobi_solve
68 procedure, pass(this) :: comp_diag => amg_jacobi_diag
69 procedure, pass(this) :: free => amg_jacobi_free
70 end type amg_jacobi_t
71
73 type, public :: amg_cheby_t
74 real(kind=rp), allocatable :: d(:)
75 type(c_ptr) :: d_d = c_null_ptr
76 real(kind=rp), allocatable :: w(:)
77 type(c_ptr) :: w_d = c_null_ptr
78 real(kind=rp), allocatable :: r(:)
79 type(c_ptr) :: r_d = c_null_ptr
80 real(kind=rp) :: tha, dlt
81 integer :: lvl
82 integer :: n
83 integer :: power_its = 250
84 integer :: max_iter = 10
85 logical :: recompute_eigs = .true.
86 contains
87 procedure, pass(this) :: init => amg_cheby_init
88 procedure, pass(this) :: solve => amg_cheby_solve
89 procedure, pass(this) :: comp_eig => amg_cheby_power
90 procedure, pass(this) :: device_solve => amg_device_cheby_solve
91 procedure, pass(this) :: device_comp_eig => amg_device_cheby_power
92 procedure, pass(this) :: free => amg_cheby_free
93 end type amg_cheby_t
94
95contains
96
101 subroutine amg_cheby_init(this, n, lvl, max_iter)
102 class(amg_cheby_t), intent(inout), target :: this
103 integer, intent(in) :: n
104 integer, intent(in) :: lvl
105 integer, intent(in) :: max_iter
106
107 allocate(this%d(n))
108 allocate(this%w(n))
109 allocate(this%r(n))
110 if (neko_bcknd_device .eq. 1) then
111 call device_map(this%d, this%d_d, n)
112 call device_map(this%w, this%w_d, n)
113 call device_map(this%r, this%r_d, n)
114 end if
115 this%n = n
116 this%lvl = lvl
117 this%max_iter = max_iter
118 this%recompute_eigs = .true.
119
120 call amg_smoo_monitor(lvl, this)
121
122 end subroutine amg_cheby_init
123
125 subroutine amg_cheby_free(this)
126 class(amg_cheby_t), intent(inout), target :: this
127 if (allocated(this%d)) then
128 if (neko_bcknd_device .eq. 1 .and. c_associated(this%d_d)) then
129 call device_unmap(this%d, this%d_d)
130 end if
131 deallocate(this%d)
132 end if
133 if (allocated(this%w)) then
134 if (neko_bcknd_device .eq. 1 .and. c_associated(this%w_d)) then
135 call device_unmap(this%w, this%w_d)
136 end if
137 deallocate(this%w)
138 end if
139 if (allocated(this%r)) then
140 if (neko_bcknd_device .eq. 1 .and. c_associated(this%r_d)) then
141 call device_unmap(this%r, this%r_d)
142 end if
143 deallocate(this%r)
144 end if
145 end subroutine amg_cheby_free
146
147
151 subroutine amg_cheby_power(this, amg, n)
152 class(amg_cheby_t), intent(inout) :: this
153 type(tamg_hierarchy_t), intent(inout) :: amg
154 integer, intent(in) :: n
155 real(kind=rp) :: lam, b, a, rn
156 real(kind=rp), parameter :: boost = 1.1_rp
157 real(kind=rp), parameter :: lam_factor = 30.0_rp
158 real(kind=rp) :: wtw, dtw, dtd
159 integer :: i
160 associate(w => this%w, d => this%d, coef => amg%coef, gs_h => amg%gs_h, &
161 msh => amg%msh, xh => amg%Xh, blst => amg%blst)
162
163 do i = 1, n
164 !call random_number(rn)
165 !d(i) = rn + 10.0_rp
166 d(i) = sin(real(i))
167 end do
168 if (this%lvl .eq. 0) then
169 call gs_h%op(d, n, gs_op_add)!TODO
170 call blst%apply(d, n)
171 end if
172 !Power method to get lamba max
173 do i = 1, this%power_its
174 call amg%matvec(w, d, this%lvl)
175
176 if (this%lvl .eq. 0) then
177 wtw = glsc3(w, coef%mult, w, n)
178 else
179 wtw = glsc2(w, w, n)
180 end if
181
182 call cmult2(d, w, 1.0_rp/sqrt(wtw), n)
183 end do
184
185 call amg%matvec(w, d, this%lvl)
186
187 if (this%lvl .eq. 0) then
188 dtw = glsc3(d, coef%mult, w, n)
189 dtd = glsc3(d, coef%mult, d, n)
190 else
191 dtw = glsc2(d, w, n)
192 dtd = glsc2(d, d, n)
193 end if
194 lam = dtw / dtd
195 b = lam * boost
196 a = lam / lam_factor
197 this%tha = (b+a)/2.0_rp
198 this%dlt = (b-a)/2.0_rp
199
200 this%recompute_eigs = .false.
201 call amg_cheby_monitor(this%lvl, lam)
202 end associate
203 end subroutine amg_cheby_power
204
211 subroutine amg_cheby_solve(this, x, f, n, amg, zero_init)
212 class(amg_cheby_t), intent(inout) :: this
213 integer, intent(in) :: n
214 real(kind=rp), dimension(n), intent(inout) :: x
215 real(kind=rp), dimension(n), intent(inout) :: f
216 class(tamg_hierarchy_t), intent(inout) :: amg
217 type(ksp_monitor_t) :: ksp_results
218 logical, optional, intent(in) :: zero_init
219 integer :: iter, max_iter, i
220 real(kind=rp) :: rtr, rnorm
221 real(kind=rp) :: rhok, rhokp1, s1, thet, delt, tmp1, tmp2
222 logical :: zero_initial_guess
223
224 if (this%recompute_eigs) then
225 call this%comp_eig(amg, n)
226 end if
227 if (present(zero_init)) then
228 zero_initial_guess = zero_init
229 else
230 zero_initial_guess = .false.
231 end if
232 max_iter = this%max_iter
233
234 associate( w => this%w, r => this%r, d => this%d, blst => amg%blst)
235 call copy(r, f, n)
236 if (.not. zero_initial_guess) then
237 call amg%matvec(w, x, this%lvl)
238 call sub2(r, w, n)
239 end if
240
241 thet = this%tha
242 delt = this%dlt
243 s1 = thet / delt
244 rhok = 1.0_rp / s1
245
246 ! First iteration
247 !OCL NORECURRENCE, NOVREC, NOALIAS
248 !DIR$ CONCURRENT
249 !GCC$ ivdep
250 !$omp parallel do
251 do i = 1, n
252 d(i) = 1.0_rp/thet * r(i)
253 x(i) = x(i) + d(i)
254 end do
255 !$omp end parallel do
256
257 ! Rest of iterations
258 do iter = 2, max_iter
259 call amg%matvec(w, d, this%lvl)
260
261 rhokp1 = 1.0_rp / (2.0_rp * s1 - rhok)
262 tmp1 = rhokp1 * rhok
263 tmp2 = 2.0_rp * rhokp1 / delt
264 rhok = rhokp1
265
266 !$omp parallel private(i)
267 !OCL NORECURRENCE, NOVREC, NOALIAS
268 !DIR$ CONCURRENT
269 !GCC$ ivdep
270 !$omp do
271 do i = 1, n
272 r(i) = r(i) - w(i)
273 d(i) = tmp1 * d(i) + tmp2 * r(i)
274 x(i) = x(i) + d(i)
275 end do
276 !$omp end do
277 !$omp end parallel
278 end do
279 end associate
280 end subroutine amg_cheby_solve
281
285 subroutine amg_device_cheby_power(this, amg, n)
286 class(amg_cheby_t), intent(inout) :: this
287 type(tamg_hierarchy_t), intent(inout) :: amg
288 integer, intent(in) :: n
289 real(kind=rp) :: lam, b, a, rn
290 real(kind=rp), parameter :: boost = 1.1_rp
291 real(kind=rp), parameter :: lam_factor = 30.0_rp
292 real(kind=rp) :: wtw, dtw, dtd
293 integer :: i
294 associate(w => this%w, d => this%d, coef => amg%coef, gs_h => amg%gs_h, &
295 msh => amg%msh, xh => amg%Xh, blst => amg%blst)
296 do i = 1, n
297 !TODO: replace with a better way to initialize power method
298 d(i) = sin(real(i))
299 end do
300 call device_memcpy(this%d, this%d_d, n, host_to_device, .true.)
301 if (this%lvl .eq. 0) then
302 call gs_h%op(d, n, gs_op_add)!TODO
303 call blst%apply(d, n)
304 end if
305 do i = 1, this%power_its
306 call amg%device_matvec(w, d, this%w_d, this%d_d, this%lvl)
307
308 if (this%lvl .eq. 0) then
309 wtw = device_glsc3(this%w_d, coef%mult_d, this%w_d, n)
310 else
311 wtw = device_glsc2(this%w_d, this%w_d, n)
312 end if
313
314 call device_cmult2(this%d_d, this%w_d, 1.0_rp/sqrt(wtw), n)
315 end do
316
317 call amg%device_matvec(w, d, this%w_d, this%d_d, this%lvl)
318
319 if (this%lvl .eq. 0) then
320 dtw = device_glsc3(this%d_d, coef%mult_d, this%w_d, n)
321 dtd = device_glsc3(this%d_d, coef%mult_d, this%d_d, n)
322 else
323 dtw = device_glsc2(this%d_d, this%w_d, n)
324 dtd = device_glsc2(this%d_d, this%d_d, n)
325 end if
326 lam = dtw / dtd
327 b = lam * boost
328 a = lam / lam_factor
329 this%tha = (b+a)/2.0_rp
330 this%dlt = (b-a)/2.0_rp
331
332 this%recompute_eigs = .false.
333 call amg_cheby_monitor(this%lvl, lam)
334 end associate
335 end subroutine amg_device_cheby_power
336
343 subroutine amg_device_cheby_solve(this, x, f, x_d, f_d, n, amg, zero_init)
344 class(amg_cheby_t), intent(inout) :: this
345 integer, intent(in) :: n
346 real(kind=rp), dimension(n), intent(inout) :: x
347 real(kind=rp), dimension(n), intent(inout) :: f
348 type(c_ptr) :: x_d
349 type(c_ptr) :: f_d
350 class(tamg_hierarchy_t), intent(inout) :: amg
351 type(ksp_monitor_t) :: ksp_results
352 logical, optional, intent(in) :: zero_init
353 integer :: iter, max_iter
354 real(kind=rp) :: rtr, rnorm
355 real(kind=rp) :: rhok, rhokp1, s1, thet, delt, tmp1, tmp2
356 logical :: zero_initial_guess
357
358 if (this%recompute_eigs) then
359 call this%device_comp_eig(amg, n)
360 end if
361 if (present(zero_init)) then
362 zero_initial_guess = zero_init
363 else
364 zero_initial_guess = .false.
365 end if
366 max_iter = this%max_iter
367
368 associate( w_d => this%w_d, r_d => this%r_d, d_d => this%d_d, &
369 blst => amg%blst)
370
371 if (.not. zero_initial_guess) then
372 call amg%device_matvec(this%w, x, w_d, x_d, this%lvl)
373 end if
374
375 thet = this%tha
376 delt = this%dlt
377 s1 = thet / delt
378 rhok = 1.0_rp / s1
379
380 ! First iteration
381 tmp1 = 1.0_rp / thet
382 call amg_device_cheby_solve_part1(r_d, f_d, w_d, x_d, d_d, &
383 tmp1, n, zero_initial_guess)
384 ! Rest of iterations
385 do iter = 2, max_iter
386 call amg%device_matvec(this%w, this%d, w_d, d_d, this%lvl)
387
388 rhokp1 = 1.0_rp / (2.0_rp * s1 - rhok)
389 tmp1 = rhokp1 * rhok
390 tmp2 = 2.0_rp * rhokp1 / delt
391 rhok = rhokp1
392
393 call amg_device_cheby_solve_part2(r_d, w_d, d_d, x_d, tmp1, tmp2, n)
394
395 end do
396 end associate
397 end subroutine amg_device_cheby_solve
398
403 subroutine amg_jacobi_init(this, n, lvl, max_iter)
404 class(amg_jacobi_t), intent(inout), target :: this
405 integer, intent(in) :: n
406 integer, intent(in) :: lvl
407 integer, intent(in) :: max_iter
408
409 allocate(this%d(n))
410 allocate(this%w(n))
411 allocate(this%r(n))
412 this%n = n
413 this%lvl = lvl
414 this%max_iter = max_iter
415 this%omega = 0.7_rp
416
417 end subroutine amg_jacobi_init
418
420 subroutine amg_jacobi_free(this)
421 class(amg_jacobi_t), intent(inout), target :: this
422 if (allocated(this%d)) then
423 deallocate(this%d)
424 end if
425 if (allocated(this%w)) then
426 deallocate(this%w)
427 end if
428 if (allocated(this%r)) then
429 deallocate(this%r)
430 end if
431 end subroutine amg_jacobi_free
432
436 subroutine amg_jacobi_diag(this, amg, n)
437 class(amg_jacobi_t), intent(inout) :: this
438 type(tamg_hierarchy_t), intent(inout) :: amg
439 integer, intent(in) :: n
440 real(kind=rp) :: val
441 integer :: i
442 do i = 1, n
443 call tamg_sample_matrix_val(val, amg, this%lvl, i, i)
444 this%d(i) = 1.0_rp / val
445 end do
446 this%recompute_diag = .false.
447 end subroutine amg_jacobi_diag
448
454 subroutine amg_jacobi_solve(this, x, f, n, amg, niter)
455 class(amg_jacobi_t), intent(inout) :: this
456 integer, intent(in) :: n
457 real(kind=rp), dimension(n), intent(inout) :: x
458 real(kind=rp), dimension(n), intent(inout) :: f
459 class(tamg_hierarchy_t), intent(inout) :: amg
460 type(ksp_monitor_t) :: ksp_results
461 integer, optional, intent(in) :: niter
462 integer :: iter, max_iter
463 real(kind=rp) :: rtr, rnorm
464 integer :: i
465
466 if (this%recompute_diag) then
467 call this%comp_diag(amg, n)
468 end if
469
470 if (present(niter)) then
471 max_iter = niter
472 else
473 max_iter = this%max_iter
474 end if
475
476 ! x = x + omega * Dinv( f - Ax )
477 associate( w => this%w, r => this%r, d => this%d)
478 do iter = 1, max_iter
479 w = 0.0_rp
481 call amg%matvec(w, x, this%lvl)
482 !$omp parallel private(i)
484 !$omp do
485 do i = 1, n
486 r(i) = f(i) - w(i)
487 end do
488 !$omp end do
490 !$omp do
491 do i = 1, n
492 r(i) = r(i) * d(i)
493 end do
494 !$omp end do
496 !$omp do
497 do i = 1, n
498 x(i) = x(i) + this%omega * r(i)
499 end do
500 !$omp end do
501 !$omp end parallel
502 end do
503 end associate
504 end subroutine amg_jacobi_solve
505
506 subroutine amg_smoo_monitor(lvl, smoo)
507 integer, intent(in) :: lvl
508 class(amg_cheby_t), intent(in) :: smoo
509 character(len=LOG_SIZE) :: log_buf
510
511 write(log_buf, '(A8,I2,A28)') '-- level', lvl, '-- init smoother: Chebyshev'
512 call neko_log%message(log_buf)
513 write(log_buf, '(A22,I6)') 'Iterations:', smoo%max_iter
514 call neko_log%message(log_buf)
515 end subroutine amg_smoo_monitor
516
517 subroutine amg_cheby_monitor(lvl, lam)
518 integer, intent(in) :: lvl
519 real(kind=rp), intent(in) :: lam
520 character(len=LOG_SIZE) :: log_buf
521
522 write(log_buf, '(A12,I2,A29,F12.3)') '-- AMG level', lvl, &
523 '-- Chebyshev approx. max eig', lam
524 call neko_log%message(log_buf)
525 end subroutine amg_cheby_monitor
526
527end module tree_amg_smoother
__device__ T solve(const T u, const T y, const T guess, const T nu, const T kappa, const T B)
double real
Map a Fortran array to a device (allocate and associate)
Definition device.F90:78
Copy data between host and device (or device and device)
Definition device.F90:72
Unmap a Fortran array from a device (deassociate and free)
Definition device.F90:84
Defines a list of bc_t.
Definition bc_list.f90:34
subroutine, public device_add2(a_d, b_d, n, strm)
Vector addition .
subroutine, public device_add3s2(a_d, b_d, c_d, c1, c2, n, strm)
Returns .
subroutine, public device_rzero(a_d, n, strm)
Zero a real vector.
subroutine, public device_sub2(a_d, b_d, n, strm)
Vector substraction .
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
real(kind=rp) function, public device_glsc2(a_d, b_d, n, strm)
Weighted inner product .
subroutine, public device_cmult2(a_d, b_d, c, n, strm)
Multiplication by constant c .
Implements device kernels for use with TreeAMG smoothers.
subroutine, public amg_device_cheby_solve_part1(r_d, f_d, w_d, x_d, d_d, inv_thet, n, zero_initial)
subroutine, public amg_device_cheby_solve_part2(r_d, w_d, d_d, x_d, tmp1, tmp2, n)
Device abstraction, common interface for various accelerators.
Definition device.F90:34
integer, parameter, public host_to_device
Definition device.F90:48
Gather-scatter.
Implements the base abstract type for Krylov solvers plus helper types.
Definition krylov.f90:34
Logging routines.
Definition log.f90:34
type(log_t), public neko_log
Global log stream.
Definition log.f90:80
integer, parameter, public log_size
Definition log.f90:46
Definition math.f90:60
subroutine, public cmult(a, c, n)
Multiplication by constant c .
Definition math.f90:502
subroutine, public cmult2(a, b, c, n)
Multiplication by constant c .
Definition math.f90:517
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
Definition math.f90:1285
real(kind=rp) function, public glsc2(a, b, n)
Weighted inner product .
Definition math.f90:1264
subroutine, public add2(a, b, n)
Vector addition .
Definition math.f90:898
subroutine, public add3s2(a, b, c, c1, c2, n)
Returns .
Definition math.f90:1091
subroutine, public col2(a, b, n)
Vector multiplication .
Definition math.f90:1044
subroutine, public copy(a, b, n)
Copy a vector .
Definition math.f90:289
subroutine, public sub2(a, b, n)
Vector substraction .
Definition math.f90:946
subroutine, public add2s2(a, b, c1, n)
Vector addition with scalar multiplication (multiplication on second argument)
Definition math.f90:996
Build configurations.
integer, parameter neko_bcknd_device
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Implements smoothers for use with TreeAMG matrix vector product.
subroutine amg_cheby_free(this)
free cheby data
subroutine amg_jacobi_init(this, n, lvl, max_iter)
Initialization of Jacobi (this is expensive...)
subroutine amg_cheby_power(this, amg, n)
Power method to approximate largest eigenvalue.
subroutine amg_device_cheby_power(this, amg, n)
Power method to approximate largest eigenvalue.
subroutine amg_cheby_solve(this, x, f, n, amg, zero_init)
Chebyshev smoother From Saad's iterative methods textbook.
subroutine amg_cheby_init(this, n, lvl, max_iter)
Initialization of chebyshev.
subroutine amg_jacobi_free(this)
free jacobi data
subroutine amg_jacobi_diag(this, amg, n)
SAMPLE MATRIX DIAGONAL VALUES (DO NOT USE, EXPENSIVE)
subroutine amg_device_cheby_solve(this, x, f, x_d, f_d, n, amg, zero_init)
Chebyshev smoother From Saad's iterative methods textbook.
subroutine amg_jacobi_solve(this, x, f, n, amg, niter)
Jacobi smoother.
subroutine amg_smoo_monitor(lvl, smoo)
subroutine amg_cheby_monitor(lvl, lam)
Implements utilities for the TreeAMG hierarchy structure.
subroutine, public tamg_sample_matrix_val(val, amg, lvl, i, j)
Sample the values in a matix (expensive, use with caution)
Implements the base type for TreeAMG hierarchy structure.
Definition tree_amg.f90:34
A list of allocatable `bc_t`. Follows the standard interface of lists.
Definition bc_list.f90:49
Type for storing initial and final residuals in a Krylov solver.
Definition krylov.f90:56
Type for a TreeAMG hierarchy.
Definition tree_amg.f90:87
Type for Chebyshev iteration using TreeAMG matvec.
Type for Chebyshev iteration using TreeAMG matvec.