Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
cheby_device.F90
Go to the documentation of this file.
1! Copyright (c) 2024-2025, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
35 use krylov, only : ksp_t, ksp_monitor_t
36 use precon, only : pc_t
37 use ax_product, only : ax_t
38 use num_types, only : rp, c_rp
39 use field, only : field_t
40 use coefs, only : coef_t
41 use mesh, only : mesh_t
42 use space, only : space_t
43 use gather_scatter, only : gs_t, gs_op_add
44 use bc_list, only : bc_list_t
45 use schwarz, only : schwarz_t
49 use device
50 use, intrinsic :: iso_c_binding, only : c_ptr, c_int, &
51 c_null_ptr, c_associated
52 implicit none
53 private
54
56 type, public, extends(ksp_t) :: cheby_device_t
57 real(kind=rp), allocatable :: d(:)
58 real(kind=rp), allocatable :: w(:)
59 real(kind=rp), allocatable :: r(:)
60 type(c_ptr) :: d_d = c_null_ptr
61 type(c_ptr) :: w_d = c_null_ptr
62 type(c_ptr) :: r_d = c_null_ptr
63 type(c_ptr) :: gs_event = c_null_ptr
64 real(kind=rp) :: tha, dlt
65 integer :: power_its = 150
66 logical :: recompute_eigs = .true.
67 logical :: zero_initial_guess = .false.
68 type(schwarz_t), pointer :: schwarz => null()
69 contains
70 procedure, pass(this) :: init => cheby_device_init
71 procedure, pass(this) :: free => cheby_device_free
72 procedure, pass(this) :: solve => cheby_device_impl
73 procedure, pass(this) :: solve_coupled => cheby_device_solve_coupled
74 end type cheby_device_t
75
76#ifdef HAVE_HIP
77 interface
78 subroutine hip_cheby_device_part1(d_d, x_d, inv_tha, n, strm) &
79 bind(c, name = 'hip_cheby_part1')
80 use, intrinsic :: iso_c_binding
81 import c_rp
82 implicit none
83 type(c_ptr), value :: d_d, x_d, strm
84 real(c_rp) :: inv_tha
85 integer(c_int) :: n
86 end subroutine hip_cheby_device_part1
87 end interface
88
89 interface
90 subroutine hip_cheby_device_part2(d_d, w_d, x_d, tmp1, tmp2, n, strm) &
91 bind(c, name = 'hip_cheby_part2')
92 use, intrinsic :: iso_c_binding
93 import c_rp
94 implicit none
95 type(c_ptr), value :: d_d, w_d, x_d, strm
96 real(c_rp) :: tmp1, tmp2
97 integer(c_int) :: n
98 end subroutine hip_cheby_device_part2
99 end interface
100#elif HAVE_CUDA
101 interface
102 subroutine cuda_cheby_device_part1(d_d, x_d, inv_tha, n, strm) &
103 bind(c, name = 'cuda_cheby_part1')
104 use, intrinsic :: iso_c_binding
105 import c_rp
106 implicit none
107 type(c_ptr), value :: d_d, x_d, strm
108 real(c_rp) :: inv_tha
109 integer(c_int) :: n
110 end subroutine cuda_cheby_device_part1
111 end interface
112
113 interface
114 subroutine cuda_cheby_device_part2(d_d, w_d, x_d, tmp1, tmp2, n, strm) &
115 bind(c, name = 'cuda_cheby_part2')
116 use, intrinsic :: iso_c_binding
117 import c_rp
118 implicit none
119 type(c_ptr), value :: d_d, w_d, x_d, strm
120 real(c_rp) :: tmp1, tmp2
121 integer(c_int) :: n
122 end subroutine cuda_cheby_device_part2
123 end interface
124#endif
125
126contains
127 subroutine cheby_device_part1(d_d, x_d, inv_tha, n)
128 type(c_ptr) :: d_d, x_d
129 real(c_rp) :: inv_tha
130 integer(c_int) :: n
131#ifdef HAVE_HIP
132 call hip_cheby_device_part1(d_d, x_d, inv_tha, n, glb_cmd_queue)
133#elif HAVE_CUDA
134 call cuda_cheby_device_part1(d_d, x_d, inv_tha, n, glb_cmd_queue)
135#else !Fallback to device_math for missing device kernels
136 call device_cmult( d_d, inv_tha, n)
137 call device_add2( x_d, d_d, n)
138#endif
139 end subroutine cheby_device_part1
140
141
142
143 subroutine cheby_device_part2(d_d, w_d, x_d, tmp1, tmp2, n)
144 type(c_ptr) :: d_d, w_d, x_d
145 real(c_rp) :: tmp1, tmp2
146 integer(c_int) :: n
147#ifdef HAVE_HIP
148 call hip_cheby_device_part2(d_d, w_d, x_d, tmp1, tmp2, n, glb_cmd_queue)
149#elif HAVE_CUDA
150 call cuda_cheby_device_part2(d_d, w_d, x_d, tmp1, tmp2, n, glb_cmd_queue)
151#else !Fallback to device_math for missing device kernels
152 call device_cmult( d_d, tmp1, n)
153 call device_add2s2( d_d, w_d, tmp2, n)
154 call device_add2( x_d, d_d, n)
155#endif
156 end subroutine cheby_device_part2
157
159 subroutine cheby_device_init(this, n, max_iter, M, rel_tol, abs_tol, monitor)
160 class(cheby_device_t), intent(inout), target :: this
161 integer, intent(in) :: max_iter
162 class(pc_t), optional, intent(in), target :: M
163 integer, intent(in) :: n
164 real(kind=rp), optional, intent(in) :: rel_tol
165 real(kind=rp), optional, intent(in) :: abs_tol
166 logical, optional, intent(in) :: monitor
167
168 call this%free()
169 allocate(this%d(n))
170 allocate(this%w(n))
171 allocate(this%r(n))
172
173 call device_map(this%d, this%d_d, n)
174 call device_map(this%w, this%w_d, n)
175 call device_map(this%r, this%r_d, n)
176
177 if (present(m)) then
178 this%M => m
179 end if
180
181 if (present(rel_tol) .and. present(abs_tol) .and. present(monitor)) then
182 call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
183 else if (present(rel_tol) .and. present(abs_tol)) then
184 call this%ksp_init(max_iter, rel_tol, abs_tol)
185 else if (present(monitor) .and. present(abs_tol)) then
186 call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
187 else if (present(rel_tol) .and. present(monitor)) then
188 call this%ksp_init(max_iter, rel_tol, monitor = monitor)
189 else if (present(rel_tol)) then
190 call this%ksp_init(max_iter, rel_tol = rel_tol)
191 else if (present(abs_tol)) then
192 call this%ksp_init(max_iter, abs_tol = abs_tol)
193 else if (present(monitor)) then
194 call this%ksp_init(max_iter, monitor = monitor)
195 else
196 call this%ksp_init(max_iter)
197 end if
198
199 call device_event_create(this%gs_event, 2)
200
201 end subroutine cheby_device_init
202
203 subroutine cheby_device_free(this)
204 class(cheby_device_t), intent(inout) :: this
205
206 call this%ksp_free()
207
208 if (allocated(this%d)) then
209 deallocate(this%d)
210 end if
211
212 if (allocated(this%w)) then
213 deallocate(this%w)
214 end if
215
216 if (allocated(this%r)) then
217 deallocate(this%r)
218 end if
219
220 nullify(this%M)
221
222 if (c_associated(this%d_d)) then
223 call device_free(this%d_d)
224 end if
225
226 if (c_associated(this%w_d)) then
227 call device_free(this%w_d)
228 end if
229
230 if (c_associated(this%r_d)) then
231 call device_free(this%r_d)
232 end if
233
234 if (c_associated(this%gs_event)) then
235 call device_event_destroy(this%gs_event)
236 end if
237
238 end subroutine cheby_device_free
239
240 subroutine cheby_device_power(this, Ax, x, n, coef, blst, gs_h)
241 class(cheby_device_t), intent(inout) :: this
242 class(ax_t), intent(in) :: Ax
243 type(field_t), intent(inout) :: x
244 integer, intent(in) :: n
245 type(coef_t), intent(inout) :: coef
246 type(bc_list_t), intent(inout) :: blst
247 type(gs_t), intent(inout) :: gs_h
248 real(kind=rp) :: lam, b, a, rn
249 real(kind=rp) :: boost = 1.1_rp
250 real(kind=rp) :: lam_factor = 30.0_rp
251 real(kind=rp) :: wtw, dtw, dtd
252 integer :: i
253
254 associate(w => this%w, w_d => this%w_d, d => this%d, d_d => this%d_d)
255
256 do i = 1, n
257 !TODO: replace with a better way to initialize power method
258 call random_number(rn)
259 d(i) = rn + 10.0_rp
260 end do
261 call device_memcpy(d, d_d, n, host_to_device, sync = .true.)
262
263 call gs_h%op(d, n, gs_op_add, this%gs_event)
264 call blst%apply(d, n)
265
266 !Power method to get lamba max
267 do i = 1, this%power_its
268 call ax%compute(w, d, coef, x%msh, x%Xh)
269 call gs_h%op(w, n, gs_op_add, this%gs_event)
270 call blst%apply(w, n)
271 if (associated(this%schwarz)) then
272 call this%schwarz%compute(this%r, w)
273 call device_copy(w_d, this%r_d, n)
274 else
275 call this%M%solve(this%r, w, n)
276 call device_copy(w_d, this%r_d, n)
277 end if
278
279 wtw = device_glsc3(w_d, coef%mult_d, w_d, n)
280 call device_cmult2(d_d, w_d, 1.0_rp/sqrt(wtw), n)
281 call blst%apply(d, n)
282 end do
283
284 call ax%compute(w, d, coef, x%msh, x%Xh)
285 call gs_h%op(w, n, gs_op_add, this%gs_event)
286 call blst%apply(w, n)
287 if (associated(this%schwarz)) then
288 call this%schwarz%compute(this%r, w)
289 call device_copy(w_d, this%r_d, n)
290 else
291 call this%M%solve(this%r, w, n)
292 call device_copy(w_d, this%r_d, n)
293 end if
294
295 dtw = device_glsc3(d_d, coef%mult_d, w_d, n)
296 dtd = device_glsc3(d_d, coef%mult_d, d_d, n)
297 lam = dtw / dtd
298 b = lam * boost
299 a = lam / lam_factor
300 this%tha = (b+a)/2.0_rp
301 this%dlt = (b-a)/2.0_rp
302
303 this%recompute_eigs = .false.
304 end associate
305 end subroutine cheby_device_power
306
308 function cheby_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter) &
309 result(ksp_results)
310 class(cheby_device_t), intent(inout) :: this
311 class(ax_t), intent(in) :: ax
312 type(field_t), intent(inout) :: x
313 integer, intent(in) :: n
314 real(kind=rp), dimension(n), intent(in) :: f
315 type(coef_t), intent(inout) :: coef
316 type(bc_list_t), intent(inout) :: blst
317 type(gs_t), intent(inout) :: gs_h
318 type(ksp_monitor_t) :: ksp_results
319 integer, optional, intent(in) :: niter
320 integer :: iter, max_iter
321 real(kind=rp) :: a, b, rtr, rnorm, norm_fac
322 type(c_ptr) :: f_d
323
324 f_d = device_get_ptr(f)
325
326 if (this%recompute_eigs) then
327 call cheby_device_power(this, ax, x, n, coef, blst, gs_h)
328 end if
329
330 if (present(niter)) then
331 max_iter = niter
332 else
333 max_iter = this%max_iter
334 end if
335 norm_fac = 1.0_rp / sqrt(coef%volume)
336
337 associate( w => this%w, r => this%r, d => this%d, &
338 w_d => this%w_d, r_d => this%r_d, d_d => this%d_d)
339 ! calculate residual
340 call device_copy(r_d, f_d, n)
341 call ax%compute(w, x%x, coef, x%msh, x%Xh)
342 call gs_h%op(w, n, gs_op_add, this%gs_event)
343 call blst%apply(w, n)
344 call device_sub2(r_d, w_d, n)
345
346 rtr = device_glsc3(r_d, coef%mult_d, r_d, n)
347 rnorm = sqrt(rtr) * norm_fac
348 ksp_results%res_start = rnorm
349 ksp_results%res_final = rnorm
350 ksp_results%iter = 0
351
352 ! First iteration
353 call this%M%solve(w, r, n)
354 call device_copy(d_d, w_d, n)
355 a = 2.0_rp / this%tha
356 call device_add2s2(x%x_d, d_d, a, n)! x = x + a*d
357
358 ! Rest of the iterations
359 do iter = 2, max_iter
360 ! calculate residual
361 call device_copy(r_d, f_d, n)
362 call ax%compute(w, x%x, coef, x%msh, x%Xh)
363 call gs_h%op(w, n, gs_op_add, this%gs_event)
364 call blst%apply(w, n)
365 call device_sub2(r_d, w_d, n)
366
367 call this%M%solve(w, r, n)
368
369 if (iter .eq. 2) then
370 b = 0.5_rp * (this%dlt * a)**2
371 else
372 b = (this%dlt * a / 2.0_rp)**2
373 end if
374 a = 1.0_rp/(this%tha - b/a)
375 call device_add2s1(d_d, w_d, b, n)! d = w + b*d
376
377 call device_add2s2(x%x_d, d_d, a, n)! x = x + a*d
378 end do
379
380 ! calculate residual
381 call device_copy(r_d, f_d, n)
382 call ax%compute(w, x%x, coef, x%msh, x%Xh)
383 call gs_h%op(w, n, gs_op_add, this%gs_event)
384 call blst%apply(w, n)
385 call device_sub2(r_d, w_d, n)
386 rtr = device_glsc3(r_d, coef%mult_d, r_d, n)
387 rnorm = sqrt(rtr) * norm_fac
388
389
390 ksp_results%res_final = rnorm
391 ksp_results%iter = iter
392 ksp_results%converged = this%is_converged(iter, rnorm)
393 end associate
394 end function cheby_device_solve
395
397 function cheby_device_impl(this, Ax, x, f, n, coef, blst, gs_h, niter) &
398 result(ksp_results)
399 class(cheby_device_t), intent(inout) :: this
400 class(ax_t), intent(in) :: ax
401 type(field_t), intent(inout) :: x
402 integer, intent(in) :: n
403 real(kind=rp), dimension(n), intent(in) :: f
404 type(coef_t), intent(inout) :: coef
405 type(bc_list_t), intent(inout) :: blst
406 type(gs_t), intent(inout) :: gs_h
407 type(ksp_monitor_t) :: ksp_results
408 integer, optional, intent(in) :: niter
409 integer :: iter, max_iter
410 real(kind=rp) :: a, b, rtr, rnorm, norm_fac
411 real(kind=rp) :: rhok, rhokp1, sig1, tmp1, tmp2
412 type(c_ptr) :: f_d
413
414 f_d = device_get_ptr(f)
415
416 if (this%recompute_eigs) then
417 call cheby_device_power(this, ax, x, n, coef, blst, gs_h)
418 end if
419
420 if (present(niter)) then
421 max_iter = niter
422 else
423 max_iter = this%max_iter
424 end if
425 norm_fac = 1.0_rp / sqrt(coef%volume)
426
427 associate( w => this%w, r => this%r, d => this%d, &
428 w_d => this%w_d, r_d => this%r_d, d_d => this%d_d)
429 ! calculate residual
430 if (.not.this%zero_initial_guess) then
431 call ax%compute(w, x%x, coef, x%msh, x%Xh)
432 call gs_h%op(w, n, gs_op_add, this%gs_event)
433 call blst%apply(w, n)
434 call device_sub3(r_d, f_d, w_d, n)
435 else
436 call device_copy(r_d, f_d, n)
437 this%zero_initial_guess = .false.
438 end if
439
440 ! First iteration
441 if (associated(this%schwarz)) then
442 call this%schwarz%compute(d, r)
443 else
444 call this%M%solve(d, r, n)
445 end if
446
447 tmp1 = 1.0_rp / this%tha
448 call cheby_device_part1(d_d, x%x_d, tmp1, n)
449
450 sig1 = this%tha / this%dlt
451 rhok = 1.0_rp / sig1
452
453 ! Rest of the iterations
454 do iter = 2, max_iter
455 rhokp1 = 1.0_rp / (2.0_rp * sig1 - rhok)
456 tmp1 = rhokp1 * rhok
457 tmp2 = 2.0_rp * rhokp1 / this%dlt
458 rhok = rhokp1
459 ! calculate residual
460 call ax%compute(w, x%x, coef, x%msh, x%Xh)
461 call gs_h%op(w, n, gs_op_add, this%gs_event)
462 call blst%apply(w, n)
463 call device_sub3(r_d, f_d, w_d, n)
464
465 if (associated(this%schwarz)) then
466 call this%schwarz%compute(w, r)
467 else
468 call this%M%solve(w, r, n)
469 end if
470
471 call cheby_device_part2(d_d, w_d, x%x_d, tmp1, tmp2, n)
472
473 end do
474
475 end associate
476 end function cheby_device_impl
477
479 function cheby_device_solve_coupled(this, Ax, x, y, z, fx, fy, fz, &
480 n, coef, blstx, blsty, blstz, gs_h, niter) result(ksp_results)
481 class(cheby_device_t), intent(inout) :: this
482 class(ax_t), intent(in) :: ax
483 type(field_t), intent(inout) :: x
484 type(field_t), intent(inout) :: y
485 type(field_t), intent(inout) :: z
486 integer, intent(in) :: n
487 real(kind=rp), dimension(n), intent(in) :: fx
488 real(kind=rp), dimension(n), intent(in) :: fy
489 real(kind=rp), dimension(n), intent(in) :: fz
490 type(coef_t), intent(inout) :: coef
491 type(bc_list_t), intent(inout) :: blstx
492 type(bc_list_t), intent(inout) :: blsty
493 type(bc_list_t), intent(inout) :: blstz
494 type(gs_t), intent(inout) :: gs_h
495 type(ksp_monitor_t), dimension(3) :: ksp_results
496 integer, optional, intent(in) :: niter
497
498 ksp_results(1) = this%solve(ax, x, fx, n, coef, blstx, gs_h, niter)
499 ksp_results(2) = this%solve(ax, y, fy, n, coef, blsty, gs_h, niter)
500 ksp_results(3) = this%solve(ax, z, fz, n, coef, blstz, gs_h, niter)
501
502 end function cheby_device_solve_coupled
503
504end module cheby_device
__device__ T solve(const T u, const T y, const T guess, const T nu, const T kappa, const T B)
Map a Fortran array to a device (allocate and associate)
Definition device.F90:77
Copy data between host and device (or device and device)
Definition device.F90:71
Defines a Matrix-vector product.
Definition ax.f90:34
Defines a list of bc_t.
Definition bc_list.f90:34
Chebyshev preconditioner.
subroutine cheby_device_init(this, n, max_iter, m, rel_tol, abs_tol, monitor)
Initialise a standard solver.
subroutine cheby_device_part2(d_d, w_d, x_d, tmp1, tmp2, n)
type(ksp_monitor_t) function, dimension(3) cheby_device_solve_coupled(this, ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Standard Cheby_Deviceshev coupled solve.
type(ksp_monitor_t) function cheby_device_solve(this, ax, x, f, n, coef, blst, gs_h, niter)
A chebyshev preconditioner.
subroutine cheby_device_free(this)
subroutine cheby_device_part1(d_d, x_d, inv_tha, n)
subroutine cheby_device_power(this, ax, x, n, coef, blst, gs_h)
type(ksp_monitor_t) function cheby_device_impl(this, ax, x, f, n, coef, blst, gs_h, niter)
A chebyshev preconditioner.
Coefficients.
Definition coef.f90:34
subroutine, public device_add2s1(a_d, b_d, c1, n, strm)
subroutine, public device_sub3(a_d, b_d, c_d, n, strm)
Vector subtraction .
subroutine, public device_add2s2(a_d, b_d, c1, n, strm)
Vector addition with scalar multiplication (multiplication on first argument)
subroutine, public device_add2(a_d, b_d, n, strm)
Vector addition .
subroutine, public device_cmult(a_d, c, n, strm)
Multiplication by constant c .
subroutine, public device_sub2(a_d, b_d, n, strm)
Vector substraction .
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
subroutine, public device_cmult2(a_d, b_d, c, n, strm)
Multiplication by constant c .
Device abstraction, common interface for various accelerators.
Definition device.F90:34
integer, parameter, public host_to_device
Definition device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:219
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition device.F90:1279
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
Definition device.F90:51
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition device.F90:1249
Defines a field.
Definition field.f90:34
Gather-scatter.
Implements the base abstract type for Krylov solvers plus helper types.
Definition krylov.f90:34
Defines a mesh.
Definition mesh.f90:34
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Krylov preconditioner.
Definition precon.f90:34
Overlapping schwarz solves.
Definition schwarz.f90:61
Defines a function space.
Definition space.f90:34
Base type for a matrix-vector product providing .
Definition ax.f90:43
A list of allocatable `bc_t`. Follows the standard interface of lists.
Definition bc_list.f90:48
Defines a Chebyshev preconditioner.
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Definition coef.f90:55
Type for storing initial and final residuals in a Krylov solver.
Definition krylov.f90:56
Base abstract type for a canonical Krylov method, solving .
Definition krylov.f90:73
Defines a canonical Krylov preconditioner.
Definition precon.f90:40
The function space for the SEM solution fields.
Definition space.f90:63