Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
cg_cpld_device.f90
Go to the documentation of this file.
1! Copyright (c) 2025, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
35 use num_types, only: rp
37 use precon, only : pc_t
38 use ax_product, only : ax_t
39 use field, only : field_t
40 use coefs, only : coef_t
41 use gather_scatter, only : gs_t, gs_op_add
42 use bc_list, only : bc_list_t
43 use math, only : abscmp
44 use device
48 use utils, only : neko_error
49 use, intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr, c_associated
50 implicit none
51
53 type, public, extends(ksp_t) :: cg_cpld_device_t
54 real(kind=rp), allocatable :: w1(:)
55 real(kind=rp), allocatable :: w2(:)
56 real(kind=rp), allocatable :: w3(:)
57 real(kind=rp), allocatable :: r1(:)
58 real(kind=rp), allocatable :: r2(:)
59 real(kind=rp), allocatable :: r3(:)
60 real(kind=rp), allocatable :: p1(:)
61 real(kind=rp), allocatable :: p2(:)
62 real(kind=rp), allocatable :: p3(:)
63 real(kind=rp), allocatable :: z1(:)
64 real(kind=rp), allocatable :: z2(:)
65 real(kind=rp), allocatable :: z3(:)
66 real(kind=rp), allocatable :: tmp(:)
67
68
69 type(c_ptr) :: w1_d = c_null_ptr
70 type(c_ptr) :: w2_d = c_null_ptr
71 type(c_ptr) :: w3_d = c_null_ptr
72
73 type(c_ptr) :: r1_d = c_null_ptr
74 type(c_ptr) :: r2_d = c_null_ptr
75 type(c_ptr) :: r3_d = c_null_ptr
76
77 type(c_ptr) :: p1_d = c_null_ptr
78 type(c_ptr) :: p2_d = c_null_ptr
79 type(c_ptr) :: p3_d = c_null_ptr
80
81 type(c_ptr) :: z1_d = c_null_ptr
82 type(c_ptr) :: z2_d = c_null_ptr
83 type(c_ptr) :: z3_d = c_null_ptr
84
85 type(c_ptr) :: tmp_d = c_null_ptr
86
87 type(c_ptr) :: gs_event = c_null_ptr
88 contains
89 procedure, pass(this) :: init => cg_cpld_device_init
90 procedure, pass(this) :: free => cg_cpld_device_free
91 procedure, pass(this) :: solve => cg_cpld_device_nop
92 procedure, pass(this) :: solve_coupled => cg_cpld_device_solve
93 end type cg_cpld_device_t
94
95contains
96
98 subroutine cg_cpld_device_init(this, n, max_iter, M, rel_tol, abs_tol, monitor)
99 class(cg_cpld_device_t), target, intent(inout) :: this
100 class(pc_t), optional, intent(in), target :: M
101 integer, intent(in) :: n
102 integer, intent(in) :: max_iter
103 real(kind=rp), optional, intent(in) :: rel_tol
104 real(kind=rp), optional, intent(in) :: abs_tol
105 logical, optional, intent(in) :: monitor
106
107 call this%free()
108
109 allocate(this%w1(n))
110 allocate(this%w2(n))
111 allocate(this%w3(n))
112 allocate(this%r1(n))
113 allocate(this%r2(n))
114 allocate(this%r3(n))
115 allocate(this%p1(n))
116 allocate(this%p2(n))
117 allocate(this%p3(n))
118 allocate(this%z1(n))
119 allocate(this%z2(n))
120 allocate(this%z3(n))
121 allocate(this%tmp(n))
122
123 call device_map(this%tmp, this%tmp_d, n)
124 call device_map(this%z1, this%z1_d, n)
125 call device_map(this%z2, this%z2_d, n)
126 call device_map(this%z3, this%z3_d, n)
127 call device_map(this%p1, this%p1_d, n)
128 call device_map(this%p2, this%p2_d, n)
129 call device_map(this%p3, this%p3_d, n)
130 call device_map(this%r1, this%r1_d, n)
131 call device_map(this%r2, this%r2_d, n)
132 call device_map(this%r3, this%r3_d, n)
133 call device_map(this%w1, this%w1_d, n)
134 call device_map(this%w2, this%w2_d, n)
135 call device_map(this%w3, this%w3_d, n)
136
137 if (present(m)) then
138 this%M => m
139 end if
140
141 if (present(rel_tol) .and. present(abs_tol) .and. present(monitor)) then
142 call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
143 else if (present(rel_tol) .and. present(abs_tol)) then
144 call this%ksp_init(max_iter, rel_tol, abs_tol)
145 else if (present(monitor) .and. present(abs_tol)) then
146 call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
147 else if (present(rel_tol) .and. present(monitor)) then
148 call this%ksp_init(max_iter, rel_tol, monitor = monitor)
149 else if (present(rel_tol)) then
150 call this%ksp_init(max_iter, rel_tol = rel_tol)
151 else if (present(abs_tol)) then
152 call this%ksp_init(max_iter, abs_tol = abs_tol)
153 else if (present(monitor)) then
154 call this%ksp_init(max_iter, monitor = monitor)
155 else
156 call this%ksp_init(max_iter)
157 end if
158
159 call device_event_create(this%gs_event, 2)
160 end subroutine cg_cpld_device_init
161
163 subroutine cg_cpld_device_free(this)
164 class(cg_cpld_device_t), intent(inout) :: this
165
166 call this%ksp_free()
167
168 if (allocated(this%w1)) then
169 deallocate(this%w1)
170 end if
171
172 if (allocated(this%w2)) then
173 deallocate(this%w2)
174 end if
175
176 if (allocated(this%w3)) then
177 deallocate(this%w3)
178 end if
179
180 if (allocated(this%r1)) then
181 deallocate(this%r1)
182 end if
183
184 if (allocated(this%r2)) then
185 deallocate(this%r2)
186 end if
187
188 if (allocated(this%r3)) then
189 deallocate(this%r3)
190 end if
191
192 if (allocated(this%p1)) then
193 deallocate(this%p1)
194 end if
195
196 if (allocated(this%p2)) then
197 deallocate(this%p2)
198 end if
199
200 if (allocated(this%p3)) then
201 deallocate(this%p3)
202 end if
203
204 if (allocated(this%z1)) then
205 deallocate(this%z1)
206 end if
207
208 if (allocated(this%z2)) then
209 deallocate(this%z2)
210 end if
211
212 if (allocated(this%z3)) then
213 deallocate(this%z3)
214 end if
215
216 if (allocated(this%tmp)) then
217 deallocate(this%tmp)
218 end if
219
220 nullify(this%M)
221
222 if (c_associated(this%w1_d)) then
223 call device_free(this%w1_d)
224 end if
225
226 if (c_associated(this%w2_d)) then
227 call device_free(this%w2_d)
228 end if
229
230 if (c_associated(this%w3_d)) then
231 call device_free(this%w3_d)
232 end if
233
234 if (c_associated(this%r1_d)) then
235 call device_free(this%r1_d)
236 end if
237
238 if (c_associated(this%r2_d)) then
239 call device_free(this%r2_d)
240 end if
241
242 if (c_associated(this%r3_d)) then
243 call device_free(this%r3_d)
244 end if
245
246 if (c_associated(this%p1_d)) then
247 call device_free(this%p1_d)
248 end if
249
250 if (c_associated(this%p2_d)) then
251 call device_free(this%p2_d)
252 end if
253
254 if (c_associated(this%p3_d)) then
255 call device_free(this%p3_d)
256 end if
257
258 if (c_associated(this%z1_d)) then
259 call device_free(this%z1_d)
260 end if
261
262 if (c_associated(this%z2_d)) then
263 call device_free(this%z2_d)
264 end if
265
266 if (c_associated(this%z3_d)) then
267 call device_free(this%z3_d)
268 end if
269
270 if (c_associated(this%tmp_d)) then
271 call device_free(this%tmp_d)
272 end if
273
274 if (c_associated(this%gs_event)) then
275 call device_event_destroy(this%gs_event)
276 end if
277
278 end subroutine cg_cpld_device_free
279
280 function cg_cpld_device_nop(this, Ax, x, f, n, coef, blst, gs_h, niter) &
281 result(ksp_results)
282 class(cg_cpld_device_t), intent(inout) :: this
283 class(ax_t), intent(in) :: ax
284 type(field_t), intent(inout) :: x
285 integer, intent(in) :: n
286 real(kind=rp), dimension(n), intent(in) :: f
287 type(coef_t), intent(inout) :: coef
288 type(bc_list_t), intent(inout) :: blst
289 type(gs_t), intent(inout) :: gs_h
290 type(ksp_monitor_t) :: ksp_results
291 integer, optional, intent(in) :: niter
292
293 ! Throw and error
294 call neko_error('The cpldcg solver is only defined for coupled solves')
295
296 ksp_results%res_final = 0.0
297 ksp_results%iter = 0
298 end function cg_cpld_device_nop
299
301 function cg_cpld_device_solve(this, Ax, x, y, z, fx, fy, fz, &
302 n, coef, blstx, blsty, blstz, gs_h, niter) result(ksp_results)
303 class(cg_cpld_device_t), intent(inout) :: this
304 class(ax_t), intent(in) :: ax
305 type(field_t), intent(inout) :: x
306 type(field_t), intent(inout) :: y
307 type(field_t), intent(inout) :: z
308 integer, intent(in) :: n
309 real(kind=rp), dimension(n), intent(in) :: fx
310 real(kind=rp), dimension(n), intent(in) :: fy
311 real(kind=rp), dimension(n), intent(in) :: fz
312 type(coef_t), intent(inout) :: coef
313 type(bc_list_t), intent(inout) :: blstx
314 type(bc_list_t), intent(inout) :: blsty
315 type(bc_list_t), intent(inout) :: blstz
316 type(gs_t), intent(inout) :: gs_h
317 type(ksp_monitor_t), dimension(3) :: ksp_results
318 integer, optional, intent(in) :: niter
319 integer :: i, iter, max_iter
320 real(kind=rp) :: rnorm, rtr, rtr0, rtz2, rtz1
321 real(kind=rp) :: beta, pap, alpha, alphm, norm_fac
322 integer, parameter :: gdim = 3
323 type(c_ptr) :: fx_d
324 type(c_ptr) :: fy_d
325 type(c_ptr) :: fz_d
326
327 fx_d = device_get_ptr(fx)
328 fy_d = device_get_ptr(fy)
329 fz_d = device_get_ptr(fz)
330
331 if (present(niter)) then
332 max_iter = niter
333 else
334 max_iter = this%max_iter
335 end if
336 norm_fac = 1.0_rp / coef%volume
337
338 associate(p1_d => this%p1_d, p2_d => this%p2_d, p3_d => this%p3_d, &
339 z1_d => this%z1_d, z2_d => this%z2_d, z3_d => this%z3_d, &
340 r1_d => this%r1_d, r2_d => this%r2_d, r3_d => this%r3_d, &
341 w1_d => this%w1_d, w2_d => this%w2_d, w3_d => this%w3_d, &
342 tmp_d => this%tmp_d)
343
344 rtz1 = 1.0_rp
345 call device_rzero(x%x_d, n)
346 call device_rzero(y%x_d, n)
347 call device_rzero(z%x_d, n)
348 call device_rzero(p1_d, n)
349 call device_rzero(p2_d, n)
350 call device_rzero(p3_d, n)
351 call device_rzero(z1_d, n)
352 call device_rzero(z2_d, n)
353 call device_rzero(z3_d, n)
354 call device_copy(r1_d, fx_d, n)
355 call device_copy(r2_d, fy_d, n)
356 call device_copy(r3_d, fz_d, n)
357 call device_vdot3(tmp_d, r1_d, r2_d, r3_d, r1_d, r2_d, r3_d, n)
358
359
360 rtr = device_glsc3(tmp_d, coef%mult_d, coef%binv_d, n)
361 rnorm = sqrt(rtr)*norm_fac
362 ksp_results%res_start = rnorm
363 ksp_results%res_final = rnorm
364 ksp_results%iter = 0
365 if(abscmp(rnorm, 0.0_rp)) then
366 ksp_results%converged = .true.
367 return
368 end if
369
370 call this%monitor_start('device_cpldCG')
371 do iter = 1, max_iter
372 call this%M%solve(this%z1, this%r1, n)
373 call this%M%solve(this%z2, this%r2, n)
374 call this%M%solve(this%z3, this%r3, n)
375 rtz2 = rtz1
376
377 call device_vdot3(tmp_d, z1_d, z2_d, z3_d, r1_d, r2_d, r3_d, n)
378
379 rtz1 = device_glsc2(tmp_d, coef%mult_d, n)
380
381 beta = rtz1 / rtz2
382 if (iter .eq. 1) beta = 0.0_rp
383 call device_add2s1(p1_d, z1_d, beta, n)
384 call device_add2s1(p2_d, z2_d, beta, n)
385 call device_add2s1(p3_d, z3_d, beta, n)
386
387 call ax%compute_vector(this%w1, this%w2, this%w3, &
388 this%p1, this%p2, this%p3, coef, x%msh, x%Xh)
389 call gs_h%op(this%w1, n, gs_op_add, this%gs_event)
390 call device_event_sync(this%gs_event)
391 call gs_h%op(this%w2, n, gs_op_add, this%gs_event)
392 call device_event_sync(this%gs_event)
393 call gs_h%op(this%w3, n, gs_op_add, this%gs_event)
394 call device_event_sync(this%gs_event)
395
396 call blstx%apply(this%w1, n)
397 call blsty%apply(this%w2, n)
398 call blstz%apply(this%w3, n)
399
400 call device_vdot3(tmp_d, w1_d, w2_d, w3_d, p1_d, p2_d, p3_d, n)
401
402 pap = device_glsc2(tmp_d, coef%mult_d, n)
403
404 alpha = rtz1 / pap
405 alphm = -alpha
406 call device_opadd2cm(x%x_d, y%x_d, z%x_d, &
407 p1_d, p2_d, p3_d, alpha, n, gdim)
408 call device_opadd2cm(r1_d, r2_d, r3_d, &
409 w1_d, w2_d, w3_d, alphm, n, gdim)
410 call device_vdot3(tmp_d, r1_d, r2_d, r3_d, r1_d, r2_d, r3_d, n)
411
412 rtr = device_glsc3(tmp_d, coef%mult_d, coef%binv_d, n)
413 if (iter .eq. 1) rtr0 = rtr
414 rnorm = sqrt(rtr) * norm_fac
415 call this%monitor_iter(iter, rnorm)
416 if (rnorm .lt. this%abs_tol) then
417 exit
418 end if
419 end do
420 end associate
421 call this%monitor_stop()
422 ksp_results%res_final = rnorm
423 ksp_results%iter = iter
424 ksp_results%converged = this%is_converged(iter, rnorm)
425
426 end function cg_cpld_device_solve
427
428end module cg_cpld_device
__device__ T solve(const T u, const T y, const T guess, const T nu, const T kappa, const T B)
Return the device pointer for an associated Fortran array.
Definition device.F90:101
Map a Fortran array to a device (allocate and associate)
Definition device.F90:77
Defines a Matrix-vector product.
Definition ax.f90:34
Defines a list of bc_t.
Definition bc_list.f90:34
Defines a coupled Conjugate Gradient methods for accelerators.
type(ksp_monitor_t) function, dimension(3) cg_cpld_device_solve(this, ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Standard PCG solve.
type(ksp_monitor_t) function cg_cpld_device_nop(this, ax, x, f, n, coef, blst, gs_h, niter)
subroutine cg_cpld_device_free(this)
Deallocate a device based PCG solver.
subroutine cg_cpld_device_init(this, n, max_iter, m, rel_tol, abs_tol, monitor)
Initialise a device based PCG solver.
Coefficients.
Definition coef.f90:34
subroutine, public device_add2s1(a_d, b_d, c1, n, strm)
subroutine, public device_rzero(a_d, n, strm)
Zero a real vector.
subroutine, public device_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm)
Compute a dot product (3-d version) assuming vector components etc.
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
real(kind=rp) function, public device_glsc2(a_d, b_d, n, strm)
Weighted inner product .
subroutine, public device_opadd2cm(a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, c, n, gdim)
Device abstraction, common interface for various accelerators.
Definition device.F90:34
subroutine, public device_event_sync(event)
Synchronize an event.
Definition device.F90:1314
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition device.F90:219
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition device.F90:1279
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition device.F90:1249
Defines a field.
Definition field.f90:34
Gather-scatter.
Implements the base abstract type for Krylov solvers plus helper types.
Definition krylov.f90:34
integer, parameter, public ksp_max_iter
Maximum number of iters.
Definition krylov.f90:51
Definition math.f90:60
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Krylov preconditioner.
Definition precon.f90:34
Utilities.
Definition utils.f90:35
Base type for a matrix-vector product providing .
Definition ax.f90:43
A list of allocatable `bc_t`. Follows the standard interface of lists.
Definition bc_list.f90:48
Device based coupled preconditioned conjugate gradient method.
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Definition coef.f90:55
Type for storing initial and final residuals in a Krylov solver.
Definition krylov.f90:56
Base abstract type for a canonical Krylov method, solving .
Definition krylov.f90:73
Defines a canonical Krylov preconditioner.
Definition precon.f90:40