Neko  0.8.99
A portable framework for high-order spectral element flow simulations
fusedcg_cpld_device.F90
Go to the documentation of this file.
1 ! Copyright (c) 2021-2024, The Neko Authors
2 ! All rights reserved.
3 !
4 ! Redistribution and use in source and binary forms, with or without
5 ! modification, are permitted provided that the following conditions
6 ! are met:
7 !
8 ! * Redistributions of source code must retain the above copyright
9 ! notice, this list of conditions and the following disclaimer.
10 !
11 ! * Redistributions in binary form must reproduce the above
12 ! copyright notice, this list of conditions and the following
13 ! disclaimer in the documentation and/or other materials provided
14 ! with the distribution.
15 !
16 ! * Neither the name of the authors nor the names of its
17 ! contributors may be used to endorse or promote products derived
18 ! from this software without specific prior written permission.
19 !
20 ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 ! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 ! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 ! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 ! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 ! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 ! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 ! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 ! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 ! POSSIBILITY OF SUCH DAMAGE.
32 !
36  use precon, only : pc_t
37  use ax_product, only : ax_t
38  use num_types, only: rp, c_rp
39  use field, only : field_t
40  use coefs, only : coef_t
41  use gather_scatter, only : gs_t, gs_op_add
42  use bc, only : bc_list_t, bc_list_apply
43  use math, only : glsc3, rzero, copy, abscmp
45  use device
46  use comm
47  implicit none
48  private
49 
50  integer, parameter :: device_fusedcg_cpld_p_space = 10
51 
53  type, public, extends(ksp_t) :: fusedcg_cpld_device_t
54  real(kind=rp), allocatable :: w1(:)
55  real(kind=rp), allocatable :: w2(:)
56  real(kind=rp), allocatable :: w3(:)
57  real(kind=rp), allocatable :: r1(:)
58  real(kind=rp), allocatable :: r2(:)
59  real(kind=rp), allocatable :: r3(:)
60  real(kind=rp), allocatable :: z1(:)
61  real(kind=rp), allocatable :: z2(:)
62  real(kind=rp), allocatable :: z3(:)
63  real(kind=rp), allocatable :: tmp(:)
64  real(kind=rp), allocatable :: p1(:,:)
65  real(kind=rp), allocatable :: p2(:,:)
66  real(kind=rp), allocatable :: p3(:,:)
67  real(kind=rp), allocatable :: alpha(:)
68  type(c_ptr) :: w1_d = c_null_ptr
69  type(c_ptr) :: w2_d = c_null_ptr
70  type(c_ptr) :: w3_d = c_null_ptr
71  type(c_ptr) :: r1_d = c_null_ptr
72  type(c_ptr) :: r2_d = c_null_ptr
73  type(c_ptr) :: r3_d = c_null_ptr
74  type(c_ptr) :: z1_d = c_null_ptr
75  type(c_ptr) :: z2_d = c_null_ptr
76  type(c_ptr) :: z3_d = c_null_ptr
77  type(c_ptr) :: alpha_d = c_null_ptr
78  type(c_ptr) :: p1_d_d = c_null_ptr
79  type(c_ptr) :: p2_d_d = c_null_ptr
80  type(c_ptr) :: p3_d_d = c_null_ptr
81  type(c_ptr) :: tmp_d = c_null_ptr
82  type(c_ptr), allocatable :: p1_d(:)
83  type(c_ptr), allocatable :: p2_d(:)
84  type(c_ptr), allocatable :: p3_d(:)
85  type(c_ptr) :: gs_event1 = c_null_ptr
86  type(c_ptr) :: gs_event2 = c_null_ptr
87  type(c_ptr) :: gs_event3 = c_null_ptr
88  contains
89  procedure, pass(this) :: init => fusedcg_cpld_device_init
90  procedure, pass(this) :: free => fusedcg_cpld_device_free
91  procedure, pass(this) :: solve => fusedcg_cpld_device_solve
92  procedure, pass(this) :: solve_coupled => fusedcg_cpld_device_solve_coupled
93  end type fusedcg_cpld_device_t
94 
95 #ifdef HAVE_CUDA
96  interface
97  subroutine cuda_fusedcg_cpld_part1(a1_d, a2_d, a3_d, &
98  b1_d, b2_d, b3_d, tmp_d, n) bind(c, name='cuda_fusedcg_cpld_part1')
99  use, intrinsic :: iso_c_binding
100  import c_rp
101  implicit none
102  type(c_ptr), value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d
103  integer(c_int) :: n
104  end subroutine cuda_fusedcg_cpld_part1
105  end interface
106 
107  interface
108  subroutine cuda_fusedcg_cpld_update_p(p1_d, p2_d, p3_d, z1_d, z2_d, z3_d, &
109  po1_d, po2_d, po3_d, beta, n) bind(c, name='cuda_fusedcg_cpld_update_p')
110  use, intrinsic :: iso_c_binding
111  import c_rp
112  implicit none
113  type(c_ptr), value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
114  type(c_ptr), value :: po1_d, po2_d, po3_d
115  real(c_rp) :: beta
116  integer(c_int) :: n
117  end subroutine cuda_fusedcg_cpld_update_p
118  end interface
119 
120  interface
121  subroutine cuda_fusedcg_cpld_update_x(x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, &
122  alpha, p_cur, n) bind(c, name='cuda_fusedcg_cpld_update_x')
123  use, intrinsic :: iso_c_binding
124  implicit none
125  type(c_ptr), value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
126  integer(c_int) :: p_cur, n
127  end subroutine cuda_fusedcg_cpld_update_x
128  end interface
129 
130  interface
131  real(c_rp) function cuda_fusedcg_cpld_part2(a1_d, a2_d, a3_d, b_d, &
132  c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n) &
133  bind(c, name='cuda_fusedcg_cpld_part2')
134  use, intrinsic :: iso_c_binding
135  import c_rp
136  implicit none
137  type(c_ptr), value :: a1_d, a2_d, a3_d, b_d
138  type(c_ptr), value :: c1_d, c2_d, c3_d, alpha_d
139  real(c_rp) :: alpha
140  integer(c_int) :: n, p_cur
141  end function cuda_fusedcg_cpld_part2
142  end interface
143 #elif HAVE_HIP
144 #endif
145 
146 contains
147 
148  subroutine device_fusedcg_cpld_part1(a1_d, a2_d, a3_d, &
149  b1_d, b2_d, b3_d, tmp_d, n)
150  type(c_ptr), value :: a1_d, a2_d, a3_d, b1_d, b2_d, b3_d
151  type(c_ptr), value :: tmp_d
152  integer(c_int) :: n
153 #ifdef HAVE_HIP
154 #elif HAVE_CUDA
155  call cuda_fusedcg_cpld_part1(a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d, n)
156 #else
157  call neko_error('No device backend configured')
158 #endif
159  end subroutine device_fusedcg_cpld_part1
160 
161  subroutine device_fusedcg_cpld_update_p(p1_d, p2_d, p3_d, z1_d, z2_d, z3_d, &
162  po1_d, po2_d, po3_d, beta, n)
163  type(c_ptr), value :: p1_d, p2_d, p3_d, z1_d, z2_d, z3_d
164  type(c_ptr), value :: po1_d, po2_d, po3_d
165  real(c_rp) :: beta
166  integer(c_int) :: n
167 #ifdef HAVE_HIP
168 #elif HAVE_CUDA
169  call cuda_fusedcg_cpld_update_p(p1_d, p2_d, p3_d, z1_d, z2_d, z3_d, &
170  po1_d, po2_d, po3_d, beta, n)
171 #else
172  call neko_error('No device backend configured')
173 #endif
174  end subroutine device_fusedcg_cpld_update_p
175 
176  subroutine device_fusedcg_cpld_update_x(x1_d, x2_d, x3_d, &
177  p1_d, p2_d, p3_d, alpha, p_cur, n)
178  type(c_ptr), value :: x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha
179  integer(c_int) :: p_cur, n
180 #ifdef HAVE_HIP
181 #elif HAVE_CUDA
182  call cuda_fusedcg_cpld_update_x(x1_d, x2_d, x3_d, &
183  p1_d, p2_d, p3_d, alpha, p_cur, n)
184 #else
185  call neko_error('No device backend configured')
186 #endif
187  end subroutine device_fusedcg_cpld_update_x
188 
189  function device_fusedcg_cpld_part2(a1_d, a2_d, a3_d, b_d, &
190  c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n) result(res)
191  type(c_ptr), value :: a1_d, a2_d, a3_d, b_d
192  type(c_ptr), value :: c1_d, c2_d, c3_d, alpha_d
193  real(c_rp) :: alpha
194  integer :: n, p_cur
195  real(kind=rp) :: res
196  integer :: ierr
197 #ifdef HAVE_HIP
198 #elif HAVE_CUDA
199  res = cuda_fusedcg_cpld_part2(a1_d, a2_d, a3_d, b_d, &
200  c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
201 #else
202  call neko_error('No device backend configured')
203 #endif
204 
205 #ifndef HAVE_DEVICE_MPI
206  if (pe_size .gt. 1) then
207  call mpi_allreduce(mpi_in_place, res, 1, &
208  mpi_real_precision, mpi_sum, neko_comm, ierr)
209  end if
210 #endif
211 
212  end function device_fusedcg_cpld_part2
213 
215  subroutine fusedcg_cpld_device_init(this, n, max_iter, M, &
216  rel_tol, abs_tol, monitor)
217  class(fusedcg_cpld_device_t), target, intent(inout) :: this
218  class(pc_t), optional, intent(inout), target :: M
219  integer, intent(in) :: n
220  integer, intent(in) :: max_iter
221  real(kind=rp), optional, intent(inout) :: rel_tol
222  real(kind=rp), optional, intent(inout) :: abs_tol
223  logical, optional, intent(in) :: monitor
224  type(c_ptr) :: ptr
225  integer(c_size_t) :: p_size
226  integer :: i
227 
228  call this%free()
229 
230  allocate(this%w1(n))
231  allocate(this%w2(n))
232  allocate(this%w3(n))
233  allocate(this%r1(n))
234  allocate(this%r2(n))
235  allocate(this%r3(n))
236  allocate(this%z1(n))
237  allocate(this%z2(n))
238  allocate(this%z3(n))
239  allocate(this%tmp(n))
240  allocate(this%p1(n, device_fusedcg_cpld_p_space))
241  allocate(this%p2(n, device_fusedcg_cpld_p_space))
242  allocate(this%p3(n, device_fusedcg_cpld_p_space))
243  allocate(this%p1_d(device_fusedcg_cpld_p_space))
244  allocate(this%p2_d(device_fusedcg_cpld_p_space))
245  allocate(this%p3_d(device_fusedcg_cpld_p_space))
246  allocate(this%alpha(device_fusedcg_cpld_p_space))
247 
248  if (present(m)) then
249  this%M => m
250  end if
251 
252  call device_map(this%w1, this%w1_d, n)
253  call device_map(this%w2, this%w2_d, n)
254  call device_map(this%w3, this%w3_d, n)
255  call device_map(this%r1, this%r1_d, n)
256  call device_map(this%r2, this%r2_d, n)
257  call device_map(this%r3, this%r3_d, n)
258  call device_map(this%z1, this%z1_d, n)
259  call device_map(this%z2, this%z2_d, n)
260  call device_map(this%z3, this%z3_d, n)
261  call device_map(this%tmp, this%tmp_d, n)
262  call device_map(this%alpha, this%alpha_d, device_fusedcg_cpld_p_space)
263  do i = 1, device_fusedcg_cpld_p_space+1
264  this%p1_d(i) = c_null_ptr
265  call device_map(this%p1(:,i), this%p1_d(i), n)
266 
267  this%p2_d(i) = c_null_ptr
268  call device_map(this%p2(:,i), this%p2_d(i), n)
269 
270  this%p3_d(i) = c_null_ptr
271  call device_map(this%p3(:,i), this%p3_d(i), n)
272  end do
273 
274  p_size = c_sizeof(c_null_ptr) * (device_fusedcg_cpld_p_space)
275  call device_alloc(this%p1_d_d, p_size)
276  call device_alloc(this%p2_d_d, p_size)
277  call device_alloc(this%p3_d_d, p_size)
278  ptr = c_loc(this%p1_d)
279  call device_memcpy(ptr, this%p1_d_d, p_size, &
280  host_to_device, sync=.false.)
281  ptr = c_loc(this%p2_d)
282  call device_memcpy(ptr, this%p2_d_d, p_size, &
283  host_to_device, sync=.false.)
284  ptr = c_loc(this%p3_d)
285  call device_memcpy(ptr, this%p3_d_d, p_size, &
286  host_to_device, sync=.false.)
287  if (present(rel_tol) .and. present(abs_tol) .and. present(monitor)) then
288  call this%ksp_init(max_iter, rel_tol, abs_tol, monitor = monitor)
289  else if (present(rel_tol) .and. present(abs_tol)) then
290  call this%ksp_init(max_iter, rel_tol, abs_tol)
291  else if (present(monitor) .and. present(abs_tol)) then
292  call this%ksp_init(max_iter, abs_tol = abs_tol, monitor = monitor)
293  else if (present(rel_tol) .and. present(monitor)) then
294  call this%ksp_init(max_iter, rel_tol, monitor = monitor)
295  else if (present(rel_tol)) then
296  call this%ksp_init(max_iter, rel_tol = rel_tol)
297  else if (present(abs_tol)) then
298  call this%ksp_init(max_iter, abs_tol = abs_tol)
299  else if (present(monitor)) then
300  call this%ksp_init(max_iter, monitor = monitor)
301  else
302  call this%ksp_init(max_iter)
303  end if
304 
305  call device_event_create(this%gs_event1, 2)
306  call device_event_create(this%gs_event2, 2)
307  call device_event_create(this%gs_event3, 2)
308 
309  end subroutine fusedcg_cpld_device_init
310 
312  subroutine fusedcg_cpld_device_free(this)
313  class(fusedcg_cpld_device_t), intent(inout) :: this
314  integer :: i
315 
316  call this%ksp_free()
317 
318  if (allocated(this%w1)) then
319  deallocate(this%w1)
320  end if
321 
322  if (allocated(this%w2)) then
323  deallocate(this%w2)
324  end if
325 
326  if (allocated(this%w3)) then
327  deallocate(this%w3)
328  end if
329 
330  if (allocated(this%r1)) then
331  deallocate(this%r1)
332  end if
333 
334  if (allocated(this%r2)) then
335  deallocate(this%r2)
336  end if
337 
338  if (allocated(this%r3)) then
339  deallocate(this%r3)
340  end if
341 
342  if (allocated(this%z1)) then
343  deallocate(this%z1)
344  end if
345 
346  if (allocated(this%z2)) then
347  deallocate(this%z2)
348  end if
349 
350  if (allocated(this%z3)) then
351  deallocate(this%z3)
352  end if
353 
354  if (allocated(this%tmp)) then
355  deallocate(this%tmp)
356  end if
357 
358  if (allocated(this%alpha)) then
359  deallocate(this%alpha)
360  end if
361 
362  if (allocated(this%p1)) then
363  deallocate(this%p1)
364  end if
365 
366  if (allocated(this%p2)) then
367  deallocate(this%p2)
368  end if
369 
370  if (allocated(this%p3)) then
371  deallocate(this%p3)
372  end if
373 
374  if (c_associated(this%w1_d)) then
375  call device_free(this%w1_d)
376  end if
377 
378  if (c_associated(this%w2_d)) then
379  call device_free(this%w2_d)
380  end if
381 
382  if (c_associated(this%w3_d)) then
383  call device_free(this%w3_d)
384  end if
385 
386  if (c_associated(this%r1_d)) then
387  call device_free(this%r1_d)
388  end if
389 
390  if (c_associated(this%r2_d)) then
391  call device_free(this%r2_d)
392  end if
393 
394  if (c_associated(this%r3_d)) then
395  call device_free(this%r3_d)
396  end if
397 
398  if (c_associated(this%z1_d)) then
399  call device_free(this%z1_d)
400  end if
401 
402  if (c_associated(this%z2_d)) then
403  call device_free(this%z2_d)
404  end if
405 
406  if (c_associated(this%z3_d)) then
407  call device_free(this%z3_d)
408  end if
409 
410  if (c_associated(this%alpha_d)) then
411  call device_free(this%alpha_d)
412  end if
413 
414  if (c_associated(this%tmp_d)) then
415  call device_free(this%tmp_d)
416  end if
417 
418  if (allocated(this%p1_d)) then
420  if (c_associated(this%p1_d(i))) then
421  call device_free(this%p1_d(i))
422  end if
423  end do
424  end if
425 
426  if (allocated(this%p2_d)) then
428  if (c_associated(this%p2_d(i))) then
429  call device_free(this%p2_d(i))
430  end if
431  end do
432  end if
433 
434  if (allocated(this%p3_d)) then
436  if (c_associated(this%p3_d(i))) then
437  call device_free(this%p3_d(i))
438  end if
439  end do
440  end if
441 
442  nullify(this%M)
443 
444  if (c_associated(this%gs_event1)) then
445  call device_event_destroy(this%gs_event1)
446  end if
447 
448  if (c_associated(this%gs_event2)) then
449  call device_event_destroy(this%gs_event2)
450  end if
451 
452  if (c_associated(this%gs_event3)) then
453  call device_event_destroy(this%gs_event3)
454  end if
455 
456  end subroutine fusedcg_cpld_device_free
457 
459  function fusedcg_cpld_device_solve_coupled(this, Ax, x, y, z, fx, fy, fz, &
460  n, coef, blstx, blsty, blstz, gs_h, niter) result(ksp_results)
461  class(fusedcg_cpld_device_t), intent(inout) :: this
462  class(ax_t), intent(inout) :: ax
463  type(field_t), intent(inout) :: x
464  type(field_t), intent(inout) :: y
465  type(field_t), intent(inout) :: z
466  integer, intent(in) :: n
467  real(kind=rp), dimension(n), intent(inout) :: fx
468  real(kind=rp), dimension(n), intent(inout) :: fy
469  real(kind=rp), dimension(n), intent(inout) :: fz
470  type(coef_t), intent(inout) :: coef
471  type(bc_list_t), intent(inout) :: blstx
472  type(bc_list_t), intent(inout) :: blsty
473  type(bc_list_t), intent(inout) :: blstz
474  type(gs_t), intent(inout) :: gs_h
475  type(ksp_monitor_t), dimension(3) :: ksp_results
476  integer, optional, intent(in) :: niter
477  integer :: iter, max_iter, ierr, i, p_cur, p_prev
478  real(kind=rp) :: rnorm, rtr, norm_fac, rtz1, rtz2
479  real(kind=rp) :: pap, beta
480  type(c_ptr) :: fx_d
481  type(c_ptr) :: fy_d
482  type(c_ptr) :: fz_d
483 
484  fx_d = device_get_ptr(fx)
485  fy_d = device_get_ptr(fy)
486  fz_d = device_get_ptr(fz)
487 
488  if (present(niter)) then
489  max_iter = niter
490  else
491  max_iter = ksp_max_iter
492  end if
493  norm_fac = 1.0_rp / sqrt(coef%volume)
494 
495  associate(w1 => this%w1, w2 => this%w2, w3 => this%w3, r1 => this%r1, &
496  r2 => this%r2, r3 => this%r3, p1 => this%p1, p2 => this%p2, &
497  p3 => this%p3, z1 => this%z1, z2 => this%z2, z3 => this%z3, &
498  tmp_d => this%tmp_d, alpha => this%alpha, alpha_d => this%alpha_d, &
499  w1_d => this%w1_d, w2_d => this%w2_d, w3_d => this%w3_d, &
500  r1_d => this%r1_d, r2_d => this%r2_d, r3_d => this%r3_d, &
501  z1_d => this%z1_d, z2_d => this%z2_d, z3_d => this%z3_d, &
502  p1_d => this%p1_d, p2_d => this%p2_d, p3_d => this%p3_d, &
503  p1_d_d => this%p1_d_d, p2_d_d => this%p2_d_d, p3_d_d => this%p3_d_d)
504 
505  rtz1 = 1.0_rp
507  p_cur = 1
508 
509 
510  call device_rzero(x%x_d, n)
511  call device_rzero(y%x_d, n)
512  call device_rzero(z%x_d, n)
513  call device_rzero(p1_d(1), n)
514  call device_rzero(p2_d(1), n)
515  call device_rzero(p3_d(1), n)
516  call device_copy(r1_d, fx_d, n)
517  call device_copy(r2_d, fy_d, n)
518  call device_copy(r3_d, fz_d, n)
519 
520  call device_fusedcg_cpld_part1(r1_d, r2_d, r3_d, r1_d, &
521  r2_d, r3_d, tmp_d, n)
522 
523  rtr = device_glsc3(tmp_d, coef%mult_d, coef%binv_d, n)
524 
525  rnorm = sqrt(rtr)*norm_fac
526  ksp_results%res_start = rnorm
527  ksp_results%res_final = rnorm
528  ksp_results(1)%iter = 0
529  ksp_results(2:3)%iter = -1
530  if(abscmp(rnorm, 0.0_rp)) return
531  call this%monitor_start('fcpldCG')
532  do iter = 1, max_iter
533  call this%M%solve(z1, r1, n)
534  call this%M%solve(z2, r2, n)
535  call this%M%solve(z3, r3, n)
536  rtz2 = rtz1
537 
538  call device_fusedcg_cpld_part1(z1_d, z2_d, z3_d, &
539  r1_d, r2_d, r3_d, tmp_d, n)
540  rtz1 = device_glsc2(tmp_d, coef%mult_d, n)
541 
542  beta = rtz1 / rtz2
543  if (iter .eq. 1) beta = 0.0_rp
544 
545  call device_fusedcg_cpld_update_p(p1_d(p_cur), p2_d(p_cur), p3_d(p_cur), &
546  z1_d, z2_d, z3_d, p1_d(p_prev), p2_d(p_prev), p3_d(p_prev), beta, n)
547 
548  call ax%compute_vector(w1, w2, w3, &
549  p1(1, p_cur), p2(1, p_cur), p3(1, p_cur), coef, x%msh, x%Xh)
550  call gs_h%op(w1, n, gs_op_add, this%gs_event1)
551  call gs_h%op(w2, n, gs_op_add, this%gs_event2)
552  call gs_h%op(w3, n, gs_op_add, this%gs_event3)
553  call device_event_sync(this%gs_event1)
554  call device_event_sync(this%gs_event2)
555  call device_event_sync(this%gs_event3)
556  call bc_list_apply(blstx, w1, n)
557  call bc_list_apply(blsty, w2, n)
558  call bc_list_apply(blstz, w3, n)
559 
560  call device_fusedcg_cpld_part1(w1_d, w2_d, w3_d, p1_d(p_cur), &
561  p2_d(p_cur), p3_d(p_cur), tmp_d, n)
562 
563  pap = device_glsc2(tmp_d, coef%mult_d, n)
564 
565  alpha(p_cur) = rtz1 / pap
566  rtr = device_fusedcg_cpld_part2(r1_d, r2_d, r3_d, coef%mult_d, &
567  w1_d, w2_d, w3_d, alpha_d, alpha(p_cur), p_cur, n)
568  rnorm = sqrt(rtr)*norm_fac
569  call this%monitor_iter(iter, rnorm)
570  if ((p_cur .eq. device_fusedcg_cpld_p_space) .or. &
571  (rnorm .lt. this%abs_tol) .or. iter .eq. max_iter) then
572  call device_fusedcg_cpld_update_x(x%x_d, y%x_d, z%x_d, &
573  p1_d_d, p2_d_d, p3_d_d, alpha_d, p_cur, n)
574  p_prev = p_cur
575  p_cur = 1
576  if (rnorm .lt. this%abs_tol) exit
577  else
578  p_prev = p_cur
579  p_cur = p_cur + 1
580  end if
581  end do
582  call this%monitor_stop()
583  ksp_results%res_final = rnorm
584  ksp_results%iter = iter
585 
586  end associate
587 
589 
591  function fusedcg_cpld_device_solve(this, Ax, x, f, n, coef, blst, &
592  gs_h, niter) result(ksp_results)
593  class(fusedcg_cpld_device_t), intent(inout) :: this
594  class(ax_t), intent(inout) :: ax
595  type(field_t), intent(inout) :: x
596  integer, intent(in) :: n
597  real(kind=rp), dimension(n), intent(inout) :: f
598  type(coef_t), intent(inout) :: coef
599  type(bc_list_t), intent(inout) :: blst
600  type(gs_t), intent(inout) :: gs_h
601  type(ksp_monitor_t) :: ksp_results
602  integer, optional, intent(in) :: niter
603 
604  ! Throw and error
605  call neko_error('Only defined for coupled solves')
606 
607  ksp_results%res_final = 0.0
608  ksp_results%iter = 0
609 
610  end function fusedcg_cpld_device_solve
611 
612 end module fusedcg_cpld_device
613 
614 
Return the device pointer for an associated Fortran array.
Definition: device.F90:81
Map a Fortran array to a device (allocate and associate)
Definition: device.F90:57
Copy data between host and device (or device and device)
Definition: device.F90:51
Defines a Matrix-vector product.
Definition: ax.f90:34
Defines a boundary condition.
Definition: bc.f90:34
Coefficients.
Definition: coef.f90:34
Definition: comm.F90:1
type(mpi_comm) neko_comm
MPI communicator.
Definition: comm.F90:16
type(mpi_datatype) mpi_real_precision
MPI type for working precision of REAL types.
Definition: comm.F90:22
integer pe_size
MPI size of communicator.
Definition: comm.F90:29
subroutine, public device_rzero(a_d, n)
Zero a real vector.
real(kind=rp) function, public device_glsc2(a_d, b_d, n)
Weighted inner product .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n)
Weighted inner product .
subroutine, public device_copy(a_d, b_d, n)
Copy a vector .
Device abstraction, common interface for various accelerators.
Definition: device.F90:34
subroutine, public device_event_sync(event)
Synchronize an event.
Definition: device.F90:1229
integer, parameter, public host_to_device
Definition: device.F90:47
subroutine, public device_free(x_d)
Deallocate memory on the device.
Definition: device.F90:185
subroutine, public device_event_destroy(event)
Destroy a device event.
Definition: device.F90:1191
subroutine, public device_alloc(x_d, s)
Allocate memory on the device.
Definition: device.F90:164
subroutine, public device_event_create(event, flags)
Create a device event queue.
Definition: device.F90:1161
Defines a field.
Definition: field.f90:34
Defines a fused Conjugate Gradient method for accelerators.
subroutine device_fusedcg_cpld_update_x(x1_d, x2_d, x3_d, p1_d, p2_d, p3_d, alpha, p_cur, n)
subroutine fusedcg_cpld_device_init(this, n, max_iter, M, rel_tol, abs_tol, monitor)
Initialise a fused PCG solver.
subroutine fusedcg_cpld_device_free(this)
Deallocate a pipelined PCG solver.
subroutine device_fusedcg_cpld_update_p(p1_d, p2_d, p3_d, z1_d, z2_d, z3_d, po1_d, po2_d, po3_d, beta, n)
type(ksp_monitor_t) function fusedcg_cpld_device_solve(this, Ax, x, f, n, coef, blst, gs_h, niter)
Pipelined PCG solve.
type(ksp_monitor_t) function, dimension(3) fusedcg_cpld_device_solve_coupled(this, Ax, x, y, z, fx, fy, fz, n, coef, blstx, blsty, blstz, gs_h, niter)
Pipelined PCG solve coupled solve.
real(kind=rp) function device_fusedcg_cpld_part2(a1_d, a2_d, a3_d, b_d, c1_d, c2_d, c3_d, alpha_d, alpha, p_cur, n)
integer, parameter device_fusedcg_cpld_p_space
subroutine device_fusedcg_cpld_part1(a1_d, a2_d, a3_d, b1_d, b2_d, b3_d, tmp_d, n)
Gather-scatter.
Implements the base abstract type for Krylov solvers plus helper types.
Definition: krylov.f90:34
integer, parameter, public ksp_max_iter
Maximum number of iters.
Definition: krylov.f90:51
Definition: math.f90:60
real(kind=rp) function, public glsc3(a, b, c, n)
Weighted inner product .
Definition: math.f90:854
subroutine, public copy(a, b, n)
Copy a vector .
Definition: math.f90:228
subroutine, public rzero(a, n)
Zero a real vector.
Definition: math.f90:184
integer, parameter, public c_rp
Definition: num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition: num_types.f90:12
Krylov preconditioner.
Definition: precon.f90:34
Base type for a matrix-vector product providing .
Definition: ax.f90:43
A list of boundary conditions.
Definition: bc.f90:104
Coefficients defined on a given (mesh, ) tuple. Arrays use indices (i,j,k,e): element e,...
Definition: coef.f90:55
Fused preconditioned conjugate gradient method.
Type for storing initial and final residuals in a Krylov solver.
Definition: krylov.f90:56
Base abstract type for a canonical Krylov method, solving .
Definition: krylov.f90:66
Defines a canonical Krylov preconditioner.
Definition: precon.f90:40