Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
device_math.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2025, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
34 use, intrinsic :: iso_c_binding, only: c_ptr, c_int
35 use num_types, only : rp, c_rp
36 use utils, only : neko_error
38 use mpi_f08, only : mpi_sum, mpi_min, mpi_max, mpi_in_place, mpi_allreduce
39 use device, only : glb_cmd_queue
40 ! ========================================================================== !
41 ! Device math interfaces
42
43 use hip_math
44 use cuda_math
45 use opencl_math
46
47 implicit none
48 private
49
!> Generic interface for adding a scalar constant to a device vector.
!> Dispatches on the constant's type: real (device_radd) or integer
!> (device_iadd). NOTE(review): device_iadd is not visible in this
!> chunk — presumably defined elsewhere in the module.
50 interface device_cadd
51 module procedure device_radd, device_iadd
52 end interface device_cadd
53
71
72contains
73
75 subroutine device_copy(a_d, b_d, n, strm)
76 type(c_ptr) :: a_d, b_d
77 integer :: n
78 type(c_ptr), optional :: strm
79 type(c_ptr) :: strm_
80
81 if (n .lt. 1) return
82
83 if (present(strm)) then
84 strm_ = strm
85 else
86 strm_ = glb_cmd_queue
87 end if
88
89#if HAVE_HIP
90 call hip_copy(a_d, b_d, n, strm_)
91#elif HAVE_CUDA
92 call cuda_copy(a_d, b_d, n, strm_)
93#elif HAVE_OPENCL
94 call opencl_copy(a_d, b_d, n, strm_)
95#else
96 call neko_error('no device backend configured')
97#endif
98 end subroutine device_copy
99
101 subroutine device_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
102 type(c_ptr) :: a_d, b_d, mask_d
103 integer :: n, n_mask
104 type(c_ptr), optional :: strm
105 type(c_ptr) :: strm_
106
107 if (n .lt. 1 .or. n_mask .lt. 1) return
108
109 if (present(strm)) then
110 strm_ = strm
111 else
112 strm_ = glb_cmd_queue
113 end if
114
115#if HAVE_HIP
116 call hip_masked_copy(a_d, b_d, mask_d, n, n_mask, strm_)
117#elif HAVE_CUDA
118 call cuda_masked_copy(a_d, b_d, mask_d, n, n_mask, strm_)
119#elif HAVE_OPENCL
120 call opencl_masked_copy(a_d, b_d, mask_d, n, n_mask, strm_)
121#else
122 call neko_error('no device backend configured')
123#endif
124 end subroutine device_masked_copy_0
125
127 subroutine device_masked_gather_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
128 type(c_ptr) :: a_d, b_d, mask_d
129 integer :: n, n_mask
130 type(c_ptr), optional :: strm
131 type(c_ptr) :: strm_
132
133 if (n .lt. 1 .or. n_mask .lt. 1) return
134
135 if (present(strm)) then
136 strm_ = strm
137 else
138 strm_ = glb_cmd_queue
139 end if
140
141#if HAVE_HIP
142 call hip_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
143#elif HAVE_CUDA
144 call cuda_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
145#elif HAVE_OPENCL
146 call opencl_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
147#else
148 call neko_error('no device backend configured')
149#endif
150 end subroutine device_masked_gather_copy_0
151
!> Gather facet-masked entries of a field on the device backend
!> (HIP/CUDA/OpenCL), using a mask plus facet index array.
!> NOTE(review): the closing `end subroutine` line appears to have been
!> lost in extraction — confirm against the original source.
153 subroutine device_face_masked_gather_copy_0(a_d, b_d, mask_d, facet_d, n1, &
154 n2, lx, ly, lz, n_mask, strm)
155 type(c_ptr) :: a_d, b_d, mask_d, facet_d
156 integer :: n1, n2, lx, ly, lz, n_mask
157 type(c_ptr), optional :: strm
158 type(c_ptr) :: strm_
159
! Nothing to do when the mask is empty
160 if (n_mask .lt. 1) return
161
! Fall back to the global command queue when no stream is given
162 if (present(strm)) then
163 strm_ = strm
164 else
165 strm_ = glb_cmd_queue
166 end if
167
168#if HAVE_HIP
169 call hip_face_masked_gather_copy(a_d, b_d, mask_d, facet_d, n1, n2, lx, &
170 ly, lz, n_mask, strm_)
171#elif HAVE_CUDA
172 call cuda_face_masked_gather_copy(a_d, b_d, mask_d, facet_d, n1, n2, lx, &
173 ly, lz, n_mask, strm_)
174#elif HAVE_OPENCL
175 call opencl_face_masked_gather_copy(a_d, b_d, mask_d, facet_d, n1, n2, &
176 lx, ly, lz, n_mask, strm_)
177#else
178 call neko_error('no device backend configured')
179#endif
181
!> Gather masked entries into an aligned packed buffer on the device.
!> NOTE(review): the closing `end subroutine` line appears to have been
!> lost in extraction — confirm against the original source.
183 ! In this case, the mask comes from a mask_t type
184 subroutine device_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
185 type(c_ptr) :: a_d, b_d, mask_d
186 integer :: n, n_mask
187 type(c_ptr), optional :: strm
188 type(c_ptr) :: strm_
189
! Skip empty vectors and empty masks
190 if (n .lt. 1 .or. n_mask .lt. 1) return
191
! Fall back to the global command queue when no stream is given
192 if (present(strm)) then
193 strm_ = strm
194 else
195 strm_ = glb_cmd_queue
196 end if
197
198#if HAVE_HIP
199 call hip_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
200#elif HAVE_CUDA
201 call cuda_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
202#elif HAVE_OPENCL
203 call opencl_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
204#else
205 call neko_error('no device backend configured')
206#endif
208
210 subroutine device_masked_scatter_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
211 type(c_ptr) :: a_d, b_d, mask_d
212 integer :: n, n_mask
213 type(c_ptr), optional :: strm
214 type(c_ptr) :: strm_
215
216 if (n .lt. 1 .or. n_mask .lt. 1) return
217
218 if (present(strm)) then
219 strm_ = strm
220 else
221 strm_ = glb_cmd_queue
222 end if
223
224#if HAVE_HIP
225 call hip_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
226#elif HAVE_CUDA
227 call cuda_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
228#elif HAVE_OPENCL
229 call opencl_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
230#else
231 call neko_error('no device backend configured')
232#endif
233 end subroutine device_masked_scatter_copy_0
234
!> Scatter an aligned packed buffer into masked positions on the device.
!> NOTE(review): the closing `end subroutine` line appears to have been
!> lost in extraction — confirm against the original source.
236 ! In this case, the mask comes from a mask_t type
237 subroutine device_masked_scatter_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
238 type(c_ptr) :: a_d, b_d, mask_d
239 integer :: n, n_mask
240 type(c_ptr), optional :: strm
241 type(c_ptr) :: strm_
242
! Skip empty vectors and empty masks
243 if (n .lt. 1 .or. n_mask .lt. 1) return
244
! Fall back to the global command queue when no stream is given
245 if (present(strm)) then
246 strm_ = strm
247 else
248 strm_ = glb_cmd_queue
249 end if
250
251#if HAVE_HIP
252 call hip_masked_scatter_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
253#elif HAVE_CUDA
254 call cuda_masked_scatter_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
255#elif HAVE_OPENCL
256 call opencl_masked_scatter_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
257#else
258 call neko_error('no device backend configured')
259#endif
261
!> Atomically reduce (accumulate) masked entries on the device.
!> Not available for the OpenCL backend. NOTE(review): the closing
!> `end subroutine` line appears to have been lost in extraction.
262 subroutine device_masked_atomic_reduction_0(a_d, b_d, mask_d, n, n_mask, strm)
263 type(c_ptr) :: a_d, b_d, mask_d
264 integer :: n, n_mask
265 type(c_ptr), optional :: strm
266 type(c_ptr) :: strm_
267
! Skip empty vectors and empty masks
268 if (n .lt. 1 .or. n_mask .lt. 1) return
269
! Fall back to the global command queue when no stream is given
270 if (present(strm)) then
271 strm_ = strm
272 else
273 strm_ = glb_cmd_queue
274 end if
275
276#if HAVE_HIP
277 call hip_masked_atomic_reduction(a_d, b_d, mask_d, n, n_mask, strm_)
278#elif HAVE_CUDA
279 call cuda_masked_atomic_reduction(a_d, b_d, mask_d, n, n_mask, strm_)
280#elif HAVE_OPENCL
281 call neko_error('No OpenCL bcknd, masked atomic reduction')
282#else
283 call neko_error('no device backend configured')
284#endif
286
289 subroutine device_cfill_mask(a_d, c, n, mask_d, n_mask, strm)
290 type(c_ptr) :: a_d
291 real(kind=rp), intent(in) :: c
292 integer :: n
293 type(c_ptr) :: mask_d
294 integer :: n_mask
295 type(c_ptr), optional :: strm
296 type(c_ptr) :: strm_
297
298 if (n .lt. 1 .or. n_mask .lt. 1) return
299
300 if (present(strm)) then
301 strm_ = strm
302 else
303 strm_ = glb_cmd_queue
304 end if
305
306#if HAVE_HIP
307 call hip_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
308#elif HAVE_CUDA
309 call cuda_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
310#elif HAVE_OPENCL
311 call opencl_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
312#else
313 call neko_error('No device backend configured')
314#endif
315 end subroutine device_cfill_mask
316
318 subroutine device_rzero(a_d, n, strm)
319 type(c_ptr) :: a_d
320 integer :: n
321 type(c_ptr), optional :: strm
322 type(c_ptr) :: strm_
323
324 if (n .lt. 1) return
325
326 if (present(strm)) then
327 strm_ = strm
328 else
329 strm_ = glb_cmd_queue
330 end if
331
332#if HAVE_HIP
333 call hip_rzero(a_d, n, strm_)
334#elif HAVE_CUDA
335 call cuda_rzero(a_d, n, strm_)
336#elif HAVE_OPENCL
337 call opencl_rzero(a_d, n, strm_)
338#else
339 call neko_error('No device backend configured')
340#endif
341 end subroutine device_rzero
342
344 subroutine device_rone(a_d, n, strm)
345 type(c_ptr) :: a_d
346 integer :: n
347 type(c_ptr), optional :: strm
348 type(c_ptr) :: strm_
349 real(kind=rp), parameter :: one = 1.0_rp
350
351 if (n .lt. 1) return
352
353 if (present(strm)) then
354 strm_ = strm
355 else
356 strm_ = glb_cmd_queue
357 end if
358
359#if HAVE_HIP || HAVE_CUDA || HAVE_OPENCL
360 call device_cfill(a_d, one, n, strm_)
361#else
362 call neko_error('No device backend configured')
363#endif
364 end subroutine device_rone
365
367 subroutine device_cmult(a_d, c, n, strm)
368 type(c_ptr) :: a_d
369 real(kind=rp), intent(in) :: c
370 integer :: n
371 type(c_ptr), optional :: strm
372 type(c_ptr) :: strm_
373
374 if (n .lt. 1) return
375
376 if (present(strm)) then
377 strm_ = strm
378 else
379 strm_ = glb_cmd_queue
380 end if
381
382#if HAVE_HIP
383 call hip_cmult(a_d, c, n, strm_)
384#elif HAVE_CUDA
385 call cuda_cmult(a_d, c, n, strm_)
386#elif HAVE_OPENCL
387 call opencl_cmult(a_d, c, n, strm_)
388#else
389 call neko_error('No device backend configured')
390#endif
391 end subroutine device_cmult
392
394 subroutine device_cmult2(a_d, b_d, c, n, strm)
395 type(c_ptr) :: a_d, b_d
396 real(kind=rp), intent(in) :: c
397 integer :: n
398 type(c_ptr), optional :: strm
399 type(c_ptr) :: strm_
400
401 if (n .lt. 1) return
402
403 if (present(strm)) then
404 strm_ = strm
405 else
406 strm_ = glb_cmd_queue
407 end if
408
409#if HAVE_HIP
410 call hip_cmult2(a_d, b_d, c, n, strm_)
411#elif HAVE_CUDA
412 call cuda_cmult2(a_d, b_d, c, n, strm_)
413#elif HAVE_OPENCL
414 call opencl_cmult2(a_d, b_d, c, n, strm_)
415#else
416 call neko_error('No device backend configured')
417#endif
418 end subroutine device_cmult2
419
421 subroutine device_cdiv(a_d, c, n, strm)
422 type(c_ptr) :: a_d
423 real(kind=rp), intent(in) :: c
424 integer :: n
425 type(c_ptr), optional :: strm
426 type(c_ptr) :: strm_
427
428 if (present(strm)) then
429 strm_ = strm
430 else
431 strm_ = glb_cmd_queue
432 end if
433
434#if HAVE_HIP
435 call hip_cdiv(a_d, c, n, strm_)
436#elif HAVE_CUDA
437 call cuda_cdiv(a_d, c, n, strm_)
438#elif HAVE_OPENCL
439 call opencl_cdiv(a_d, c, n, strm_)
440#else
441 call neko_error('No device backend configured')
442#endif
443 end subroutine device_cdiv
444
446 subroutine device_cdiv2(a_d, b_d, c, n, strm)
447 type(c_ptr) :: a_d, b_d
448 real(kind=rp), intent(in) :: c
449 integer :: n
450 type(c_ptr), optional :: strm
451 type(c_ptr) :: strm_
452
453 if (present(strm)) then
454 strm_ = strm
455 else
456 strm_ = glb_cmd_queue
457 end if
458
459#if HAVE_HIP
460 call hip_cdiv2(a_d, b_d, c, n, strm_)
461#elif HAVE_CUDA
462 call cuda_cdiv2(a_d, b_d, c, n, strm_)
463#elif HAVE_OPENCL
464 call opencl_cdiv2(a_d, b_d, c, n, strm_)
465#else
466 call neko_error('No device backend configured')
467#endif
468 end subroutine device_cdiv2
469
471 subroutine device_radd(a_d, c, n, strm)
472 type(c_ptr) :: a_d
473 real(kind=rp), intent(in) :: c
474 integer :: n
475 type(c_ptr), optional :: strm
476 type(c_ptr) :: strm_
477
478 if (n .lt. 1) return
479
480 if (present(strm)) then
481 strm_ = strm
482 else
483 strm_ = glb_cmd_queue
484 end if
485
486#if HAVE_HIP
487 call hip_radd(a_d, c, n, strm_)
488#elif HAVE_CUDA
489 call cuda_radd(a_d, c, n, strm_)
490#elif HAVE_OPENCL
491 call opencl_radd(a_d, c, n, strm_)
492#else
493 call neko_error('No device backend configured')
494#endif
495 end subroutine device_radd
496
498 subroutine device_cadd2(a_d, b_d, c, n, strm)
499 type(c_ptr) :: a_d
500 type(c_ptr) :: b_d
501 real(kind=rp), intent(in) :: c
502 integer :: n
503 type(c_ptr), optional :: strm
504 type(c_ptr) :: strm_
505
506 if (n .lt. 1) return
507
508 if (present(strm)) then
509 strm_ = strm
510 else
511 strm_ = glb_cmd_queue
512 end if
513
514#if HAVE_HIP
515 call hip_cadd2(a_d, b_d, c, n, strm_)
516#elif HAVE_CUDA
517 call cuda_cadd2(a_d, b_d, c, n, strm_)
518#elif HAVE_OPENCL
519 call opencl_cadd2(a_d, b_d, c, n, strm_)
520#else
521 call neko_error('No device backend configured')
522#endif
523 end subroutine device_cadd2
524
526 subroutine device_cwrap(a_d, min_val, max_val, n, strm)
527 type(c_ptr) :: a_d
528 real(kind=rp), intent(in) :: min_val, max_val
529 integer :: n
530 type(c_ptr), optional :: strm
531 type(c_ptr) :: strm_
532
533 if (n .lt. 1 .or. max_val .le. min_val) return
534
535 if (present(strm)) then
536 strm_ = strm
537 else
538 strm_ = glb_cmd_queue
539 end if
540
541#if HAVE_HIP
542 call hip_cwrap(a_d, min_val, max_val, n, strm_)
543#elif HAVE_CUDA
544 call cuda_cwrap(a_d, min_val, max_val, n, strm_)
545#elif HAVE_OPENCL
546 call opencl_cwrap(a_d, min_val, max_val, n, strm_)
547#else
548 call neko_error('No device backend configured')
549#endif
550 end subroutine device_cwrap
551
553 subroutine device_cfill(a_d, c, n, strm)
554 type(c_ptr) :: a_d
555 real(kind=rp), intent(in) :: c
556 integer :: n
557 type(c_ptr), optional ::strm
558 type(c_ptr) :: strm_
559
560 if (n .lt. 1) return
561
562 if (present(strm)) then
563 strm_ = strm
564 else
565 strm_ = glb_cmd_queue
566 end if
567
568#if HAVE_HIP
569 call hip_cfill(a_d, c, n, strm_)
570#elif HAVE_CUDA
571 call cuda_cfill(a_d, c, n, strm_)
572#elif HAVE_OPENCL
573 call opencl_cfill(a_d, c, n, strm_)
574#else
575 call neko_error('No device backend configured')
576#endif
577 end subroutine device_cfill
578
580 subroutine device_add2(a_d, b_d, n, strm)
581 type(c_ptr) :: a_d, b_d
582 integer :: n
583 type(c_ptr), optional :: strm
584 type(c_ptr) :: strm_
585
586 if (n .lt. 1) return
587
588 if (present(strm)) then
589 strm_ = strm
590 else
591 strm_ = glb_cmd_queue
592 end if
593
594#if HAVE_HIP
595 call hip_add2(a_d, b_d, n, strm_)
596#elif HAVE_CUDA
597 call cuda_add2(a_d, b_d, n, strm_)
598#elif HAVE_OPENCL
599 call opencl_add2(a_d, b_d, n, strm_)
600#else
601 call neko_error('No device backend configured')
602#endif
603 end subroutine device_add2
604
605 subroutine device_add4(a_d, b_d, c_d, d_d, n, strm)
606 type(c_ptr) :: a_d, b_d, c_d, d_d
607 integer :: n
608 type(c_ptr), optional :: strm
609 type(c_ptr) :: strm_
610
611 if (n .lt. 1) return
612
613 if (present(strm)) then
614 strm_ = strm
615 else
616 strm_ = glb_cmd_queue
617 end if
618
619#if HAVE_HIP
620 call hip_add4(a_d, b_d, c_d, d_d, n, strm_)
621#elif HAVE_CUDA
622 call cuda_add4(a_d, b_d, c_d, d_d, n, strm_)
623#elif HAVE_OPENCL
624 call opencl_add4(a_d, b_d, c_d, d_d, n, strm_)
625#else
626 call neko_error('No device backend configured')
627#endif
628 end subroutine device_add4
629
630 subroutine device_add2s1(a_d, b_d, c1, n, strm)
631 type(c_ptr) :: a_d, b_d
632 real(kind=rp) :: c1
633 integer :: n
634 type(c_ptr), optional :: strm
635 type(c_ptr) :: strm_
636
637 if (n .lt. 1) return
638
639 if (present(strm)) then
640 strm_ = strm
641 else
642 strm_ = glb_cmd_queue
643 end if
644
645#if HAVE_HIP
646 call hip_add2s1(a_d, b_d, c1, n, strm_)
647#elif HAVE_CUDA
648 call cuda_add2s1(a_d, b_d, c1, n, strm_)
649#elif HAVE_OPENCL
650 call opencl_add2s1(a_d, b_d, c1, n, strm_)
651#else
652 call neko_error('No device backend configured')
653#endif
654 end subroutine device_add2s1
655
658 subroutine device_add2s2(a_d, b_d, c1, n, strm)
659 type(c_ptr) :: a_d, b_d
660 real(kind=rp) :: c1
661 integer :: n
662 type(c_ptr), optional :: strm
663 type(c_ptr) :: strm_
664
665 if (n .lt. 1) return
666
667 if (present(strm)) then
668 strm_ = strm
669 else
670 strm_ = glb_cmd_queue
671 end if
672
673#if HAVE_HIP
674 call hip_add2s2(a_d, b_d, c1, n, strm_)
675#elif HAVE_CUDA
676 call cuda_add2s2(a_d, b_d, c1, n, strm_)
677#elif HAVE_OPENCL
678 call opencl_add2s2(a_d, b_d, c1, n, strm_)
679#else
680 call neko_error('No device backend configured')
681#endif
682 end subroutine device_add2s2
683
685 subroutine device_addsqr2s2(a_d, b_d, c1, n, strm)
686 type(c_ptr) :: a_d, b_d
687 real(kind=rp) :: c1
688 integer :: n
689 type(c_ptr), optional :: strm
690 type(c_ptr) :: strm_
691
692 if (n .lt. 1) return
693
694 if (present(strm)) then
695 strm_ = strm
696 else
697 strm_ = glb_cmd_queue
698 end if
699
700#if HAVE_HIP
701 call hip_addsqr2s2(a_d, b_d, c1, n, strm_)
702#elif HAVE_CUDA
703 call cuda_addsqr2s2(a_d, b_d, c1, n, strm_)
704#elif HAVE_OPENCL
705 call opencl_addsqr2s2(a_d, b_d, c1, n, strm_)
706#else
707 call neko_error('No device backend configured')
708#endif
709 end subroutine device_addsqr2s2
710
712 subroutine device_add3(a_d, b_d, c_d, n, strm)
713 type(c_ptr) :: a_d, b_d, c_d
714 integer :: n
715 type(c_ptr), optional :: strm
716 type(c_ptr) :: strm_
717
718 if (n .lt. 1) return
719
720 if (present(strm)) then
721 strm_ = strm
722 else
723 strm_ = glb_cmd_queue
724 end if
725
726#if HAVE_HIP
727 call hip_add3(a_d, b_d, c_d, n, strm_)
728#elif HAVE_CUDA
729 call cuda_add3(a_d, b_d, c_d, n, strm_)
730#elif HAVE_OPENCL
731 call opencl_add3(a_d, b_d, c_d, n, strm_)
732#else
733 call neko_error('No device backend configured')
734#endif
735 end subroutine device_add3
736
738 subroutine device_add3s2(a_d, b_d, c_d, c1, c2 , n, strm)
739 type(c_ptr) :: a_d, b_d, c_d
740 real(kind=rp) :: c1, c2
741 integer :: n
742 type(c_ptr), optional :: strm
743 type(c_ptr) :: strm_
744
745 if (n .lt. 1) return
746
747 if (present(strm)) then
748 strm_ = strm
749 else
750 strm_ = glb_cmd_queue
751 end if
752
753#if HAVE_HIP
754 call hip_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
755#elif HAVE_CUDA
756 call cuda_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
757#elif HAVE_OPENCL
758 call opencl_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
759#else
760 call neko_error('No device backend configured')
761#endif
762 end subroutine device_add3s2
763
765 subroutine device_add4s3(a_d, b_d, c_d, d_d, c1, c2 , c3, n, strm)
766 type(c_ptr) :: a_d, b_d, c_d, d_d
767 real(kind=rp) :: c1, c2, c3
768 integer :: n
769 type(c_ptr), optional :: strm
770 type(c_ptr) :: strm_
771
772 if (n .lt. 1) return
773
774 if (present(strm)) then
775 strm_ = strm
776 else
777 strm_ = glb_cmd_queue
778 end if
779
780#if HAVE_HIP
781 call hip_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
782#elif HAVE_CUDA
783 call cuda_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
784#elif HAVE_OPENCL
785 call opencl_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
786#else
787 call neko_error('No device backend configured')
788#endif
789 end subroutine device_add4s3
790
792 subroutine device_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2 , c3, c4, n, strm)
793 type(c_ptr) :: a_d, b_d, c_d, d_d, e_d
794 real(kind=rp) :: c1, c2, c3, c4
795 integer :: n
796 type(c_ptr), optional :: strm
797 type(c_ptr) :: strm_
798
799 if (n .lt. 1) return
800
801 if (present(strm)) then
802 strm_ = strm
803 else
804 strm_ = glb_cmd_queue
805 end if
806
807#if HAVE_HIP
808 call hip_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
809#elif HAVE_CUDA
810 call cuda_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
811#elif HAVE_OPENCL
812 call opencl_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
813#else
814 call neko_error('No device backend configured')
815#endif
816 end subroutine device_add5s4
817
819 subroutine device_invcol1(a_d, n, strm)
820 type(c_ptr) :: a_d
821 integer :: n
822 type(c_ptr), optional :: strm
823 type(c_ptr) :: strm_
824
825 if (n .lt. 1) return
826
827 if (present(strm)) then
828 strm_ = strm
829 else
830 strm_ = glb_cmd_queue
831 end if
832
833#if HAVE_HIP
834 call hip_invcol1(a_d, n, strm_)
835#elif HAVE_CUDA
836 call cuda_invcol1(a_d, n, strm_)
837#elif HAVE_OPENCL
838 call opencl_invcol1(a_d, n, strm_)
839#else
840 call neko_error('No device backend configured')
841#endif
842 end subroutine device_invcol1
843
845 subroutine device_invcol2(a_d, b_d, n, strm)
846 type(c_ptr) :: a_d, b_d
847 integer :: n
848 type(c_ptr), optional :: strm
849 type(c_ptr) :: strm_
850
851 if (n .lt. 1) return
852
853 if (present(strm)) then
854 strm_ = strm
855 else
856 strm_ = glb_cmd_queue
857 end if
858
859#if HAVE_HIP
860 call hip_invcol2(a_d, b_d, n, strm_)
861#elif HAVE_CUDA
862 call cuda_invcol2(a_d, b_d, n, strm_)
863#elif HAVE_OPENCL
864 call opencl_invcol2(a_d, b_d, n, strm_)
865#else
866 call neko_error('No device backend configured')
867#endif
868 end subroutine device_invcol2
869
871 subroutine device_invcol3(a_d, b_d, c_d, n, strm)
872 type(c_ptr) :: a_d, b_d, c_d
873 integer :: n
874 type(c_ptr), optional :: strm
875 type(c_ptr) :: strm_
876
877 if (present(strm)) then
878 strm_ = strm
879 else
880 strm_ = glb_cmd_queue
881 end if
882
883#ifdef HAVE_HIP
884 call hip_invcol3(a_d, b_d, c_d, n, strm_)
885#elif HAVE_CUDA
886 call cuda_invcol3(a_d, b_d, c_d, n, strm_)
887#elif HAVE_OPENCL
888 ! call opencl_invcol3(a_d, b_d, c_d, n)
889 call neko_error('opencl_invcol3 not implemented')
890#else
891 call neko_error('No device backend configured')
892#endif
893 end subroutine device_invcol3
894
896 subroutine device_col2(a_d, b_d, n, strm)
897 type(c_ptr) :: a_d, b_d
898 integer :: n
899 type(c_ptr), optional :: strm
900 type(c_ptr) :: strm_
901
902 if (present(strm)) then
903 strm_ = strm
904 else
905 strm_ = glb_cmd_queue
906 end if
907
908 if (n .lt. 1) return
909#if HAVE_HIP
910 call hip_col2(a_d, b_d, n, strm_)
911#elif HAVE_CUDA
912 call cuda_col2(a_d, b_d, n, strm_)
913#elif HAVE_OPENCL
914 call opencl_col2(a_d, b_d, n, strm_)
915#else
916 call neko_error('No device backend configured')
917#endif
918 end subroutine device_col2
919
921 subroutine device_col3(a_d, b_d, c_d, n, strm)
922 type(c_ptr) :: a_d, b_d, c_d
923 integer :: n
924 type(c_ptr), optional :: strm
925 type(c_ptr) :: strm_
926
927 if (n .lt. 1) return
928
929 if (present(strm)) then
930 strm_ = strm
931 else
932 strm_ = glb_cmd_queue
933 end if
934
935#if HAVE_HIP
936 call hip_col3(a_d, b_d, c_d, n, strm_)
937#elif HAVE_CUDA
938 call cuda_col3(a_d, b_d, c_d, n, strm_)
939#elif HAVE_OPENCL
940 call opencl_col3(a_d, b_d, c_d, n, strm_)
941#else
942 call neko_error('No device backend configured')
943#endif
944 end subroutine device_col3
945
947 subroutine device_subcol3(a_d, b_d, c_d, n, strm)
948 type(c_ptr) :: a_d, b_d, c_d
949 integer :: n
950 type(c_ptr), optional :: strm
951 type(c_ptr) :: strm_
952
953 if (n .lt. 1) return
954
955 if (present(strm)) then
956 strm_ = strm
957 else
958 strm_ = glb_cmd_queue
959 end if
960
961#if HAVE_HIP
962 call hip_subcol3(a_d, b_d, c_d, n, strm_)
963#elif HAVE_CUDA
964 call cuda_subcol3(a_d, b_d, c_d, n, strm_)
965#elif HAVE_OPENCL
966 call opencl_subcol3(a_d, b_d, c_d, n, strm_)
967#else
968 call neko_error('No device backend configured')
969#endif
970 end subroutine device_subcol3
971
973 subroutine device_sub2(a_d, b_d, n, strm)
974 type(c_ptr) :: a_d, b_d
975 integer :: n
976 type(c_ptr), optional :: strm
977 type(c_ptr) :: strm_
978
979 if (n .lt. 1) return
980
981 if (present(strm)) then
982 strm_ = strm
983 else
984 strm_ = glb_cmd_queue
985 end if
986
987#if HAVE_HIP
988 call hip_sub2(a_d, b_d, n, strm_)
989#elif HAVE_CUDA
990 call cuda_sub2(a_d, b_d, n, strm_)
991#elif HAVE_OPENCL
992 call opencl_sub2(a_d, b_d, n, strm_)
993#else
994 call neko_error('No device backend configured')
995#endif
996 end subroutine device_sub2
997
999 subroutine device_sub3(a_d, b_d, c_d, n, strm)
1000 type(c_ptr) :: a_d, b_d, c_d
1001 integer :: n
1002 type(c_ptr), optional :: strm
1003 type(c_ptr) :: strm_
1004
1005 if (n .lt. 1) return
1006
1007 if (present(strm)) then
1008 strm_ = strm
1009 else
1010 strm_ = glb_cmd_queue
1011 end if
1012
1013#if HAVE_HIP
1014 call hip_sub3(a_d, b_d, c_d, n, strm_)
1015#elif HAVE_CUDA
1016 call cuda_sub3(a_d, b_d, c_d, n, strm_)
1017#elif HAVE_OPENCL
1018 call opencl_sub3(a_d, b_d, c_d, n, strm_)
1019#else
1020 call neko_error('No device backend configured')
1021#endif
1022 end subroutine device_sub3
1023
1025 subroutine device_addcol3(a_d, b_d, c_d, n, strm)
1026 type(c_ptr) :: a_d, b_d, c_d
1027 integer :: n
1028 type(c_ptr), optional :: strm
1029 type(c_ptr) :: strm_
1030
1031 if (n .lt. 1) return
1032
1033 if (present(strm)) then
1034 strm_ = strm
1035 else
1036 strm_ = glb_cmd_queue
1037 end if
1038
1039#if HAVE_HIP
1040 call hip_addcol3(a_d, b_d, c_d, n, strm_)
1041#elif HAVE_CUDA
1042 call cuda_addcol3(a_d, b_d, c_d, n, strm_)
1043#elif HAVE_OPENCL
1044 call opencl_addcol3(a_d, b_d, c_d, n, strm_)
1045#else
1046 call neko_error('No device backend configured')
1047#endif
1048 end subroutine device_addcol3
1049
1051 subroutine device_addcol4(a_d, b_d, c_d, d_d, n, strm)
1052 type(c_ptr) :: a_d, b_d, c_d, d_d
1053 integer :: n
1054 type(c_ptr), optional :: strm
1055 type(c_ptr) :: strm_
1056
1057 if (n .lt. 1) return
1058
1059 if (present(strm)) then
1060 strm_ = strm
1061 else
1062 strm_ = glb_cmd_queue
1063 end if
1064
1065#if HAVE_HIP
1066 call hip_addcol4(a_d, b_d, c_d, d_d, n, strm_)
1067#elif HAVE_CUDA
1068 call cuda_addcol4(a_d, b_d, c_d, d_d, n, strm_)
1069#elif HAVE_OPENCL
1070 call opencl_addcol4(a_d, b_d, c_d, d_d, n, strm_)
1071#else
1072 call neko_error('No device backend configured')
1073#endif
1074 end subroutine device_addcol4
1075
1077 subroutine device_addcol3s2(a_d, b_d, c_d, s, n, strm)
1078 type(c_ptr) :: a_d, b_d, c_d
1079 real(kind=rp) :: s
1080 integer :: n
1081 type(c_ptr), optional :: strm
1082 type(c_ptr) :: strm_
1083
1084 if (n .lt. 1) return
1085
1086 if (present(strm)) then
1087 strm_ = strm
1088 else
1089 strm_ = glb_cmd_queue
1090 end if
1091
1092#if HAVE_HIP
1093 call hip_addcol3s2(a_d, b_d, c_d, s, n, strm_)
1094#elif HAVE_CUDA
1095 call cuda_addcol3s2(a_d, b_d, c_d, s, n, strm_)
1096#elif HAVE_OPENCL
1097 call opencl_addcol3s2(a_d, b_d, c_d, s, n, strm_)
1098#else
1099 call neko_error('No device backend configured')
1100#endif
1101 end subroutine device_addcol3s2
1102
1105 subroutine device_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm)
1106 type(c_ptr) :: dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d
1107 integer :: n
1108 type(c_ptr), optional :: strm
1109 type(c_ptr) :: strm_
1110
1111 if (n .lt. 1) return
1112
1113 if (present(strm)) then
1114 strm_ = strm
1115 else
1116 strm_ = glb_cmd_queue
1117 end if
1118
1119#if HAVE_HIP
1120 call hip_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1121#elif HAVE_CUDA
1122 call cuda_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1123#elif HAVE_OPENCL
1124 call opencl_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1125#else
1126 call neko_error('No device backend configured')
1127#endif
1128 end subroutine device_vdot3
1129
1132 subroutine device_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1133 w1_d, w2_d, w3_d, n, strm)
1134 type(c_ptr) :: u1_d, u2_d, u3_d
1135 type(c_ptr) :: v1_d, v2_d, v3_d
1136 type(c_ptr) :: w1_d, w2_d, w3_d
1137 integer :: n
1138 type(c_ptr), optional :: strm
1139 type(c_ptr) :: strm_
1140
1141 if (n .lt. 1) return
1142
1143 if (present(strm)) then
1144 strm_ = strm
1145 else
1146 strm_ = glb_cmd_queue
1147 end if
1148
1149#if HAVE_HIP
1150 call hip_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1151 w1_d, w2_d, w3_d, n, strm_)
1152#elif HAVE_CUDA
1153 call cuda_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1154 w1_d, w2_d, w3_d, n, strm_)
1155#elif HAVE_OPENCL
1156 call opencl_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1157 w1_d, w2_d, w3_d, n, strm_)
1158#else
1159 call neko_error('No device backend configured')
1160#endif
1161 end subroutine device_vcross
1162
1163
1165 function device_vlsc3(u_d, v_d, w_d, n, strm) result(res)
1166 type(c_ptr) :: u_d, v_d, w_d
1167 integer :: n
1168 type(c_ptr), optional :: strm
1169 type(c_ptr) :: strm_
1170 real(kind=rp) :: res
1171
1172 res = 0.0_rp
1173
1174 if (n .lt. 1) return
1175
1176 if (present(strm)) then
1177 strm_ = strm
1178 else
1179 strm_ = glb_cmd_queue
1180 end if
1181
1182#if HAVE_HIP
1183 res = hip_vlsc3(u_d, v_d, w_d, n, strm_)
1184#elif HAVE_CUDA
1185 res = cuda_vlsc3(u_d, v_d, w_d, n, strm_)
1186#elif HAVE_OPENCL
1187 ! Same kernel as glsc3 (currently no device MPI for OpenCL)
1188 res = opencl_glsc3(u_d, v_d, w_d, n, strm_)
1189#else
1190 call neko_error('No device backend configured')
1191#endif
1192 end function device_vlsc3
1193
!> Global weighted inner product \f$ \sum a_i b_i c_i \f$ across all
!> MPI ranks. No early return for n < 1: the mpi_allreduce below is
!> collective and must be executed on every rank.
!> NOTE(review): pe_size, neko_comm and mpi_real_precision are not in
!> the visible use-list — presumably host-associated from elsewhere in
!> the module; confirm against the full source.
1195 function device_glsc3(a_d, b_d, c_d, n, strm) result(res)
1196 type(c_ptr) :: a_d, b_d, c_d
1197 integer :: n, ierr
1198 type(c_ptr), optional :: strm
1199 type(c_ptr) :: strm_
1200 real(kind=rp) :: res
1201
1202 if (present(strm)) then
1203 strm_ = strm
1204 else
1205 strm_ = glb_cmd_queue
1206 end if
1207
! Local contribution from this rank's device data
1208 res = 0.0_rp
1209#if HAVE_HIP
1210 res = hip_glsc3(a_d, b_d, c_d, n, strm_)
1211#elif HAVE_CUDA
1212 res = cuda_glsc3(a_d, b_d, c_d, n, strm_)
1213#elif HAVE_OPENCL
1214 res = opencl_glsc3(a_d, b_d, c_d, n, strm_)
1215#else
1216 call neko_error('No device backend configured')
1217#endif
1218
! Host-side reduction only when device-aware MPI is not available
1219#ifndef HAVE_DEVICE_MPI
1220 if (pe_size .gt. 1) then
1221 call mpi_allreduce(mpi_in_place, res, 1, &
1222 mpi_real_precision, mpi_sum, neko_comm, ierr)
1223 end if
1224#endif
1225 end function device_glsc3
1226
!> Batched global weighted inner products: computes j inner products
!> against a common weight, results returned in h(1:j). No n-guard:
!> the mpi_allreduce is collective across all ranks.
!> NOTE(review): pe_size, neko_comm and mpi_real_precision come from
!> outside the visible use-list — confirm against the full source.
1227 subroutine device_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm)
1228 type(c_ptr), value :: w_d, v_d_d, mult_d
1229 integer(c_int) :: j, n
1230 real(c_rp) :: h(j)
1231 type(c_ptr), optional :: strm
1232 type(c_ptr) :: strm_
1233 integer :: ierr
1234
1235 if (present(strm)) then
1236 strm_ = strm
1237 else
1238 strm_ = glb_cmd_queue
1239 end if
1240
1241#if HAVE_HIP
1242 call hip_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1243#elif HAVE_CUDA
1244 call cuda_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1245#elif HAVE_OPENCL
1246 call opencl_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1247#else
1248 call neko_error('No device backend configured')
1249#endif
1250
! Host-side reduction only when device-aware MPI is not available
1251#ifndef HAVE_DEVICE_MPI
1252 if (pe_size .gt. 1) then
1253 call mpi_allreduce(mpi_in_place, h, j, &
1254 mpi_real_precision, mpi_sum, neko_comm, ierr)
1255 end if
1256#endif
1257 end subroutine device_glsc3_many
1258
1259 subroutine device_add2s2_many(y_d, x_d_d, a_d, j, n, strm)
1260 type(c_ptr), value :: y_d, x_d_d, a_d
1261 integer(c_int) :: j, n
1262 type(c_ptr), optional :: strm
1263 type(c_ptr) :: strm_
1264
1265 if (n .lt. 1) return
1266
1267 if (present(strm)) then
1268 strm_ = strm
1269 else
1270 strm_ = glb_cmd_queue
1271 end if
1272
1273#if HAVE_HIP
1274 call hip_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1275#elif HAVE_CUDA
1276 call cuda_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1277#elif HAVE_OPENCL
1278 call opencl_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1279#else
1280 call neko_error('No device backend configured')
1281#endif
1282 end subroutine device_add2s2_many
1283
!> Global inner product \f$ \sum a_i b_i \f$ across all MPI ranks.
!> No early return for n < 1: the mpi_allreduce below is collective
!> and must be executed on every rank.
!> NOTE(review): pe_size, neko_comm and mpi_real_precision come from
!> outside the visible use-list — confirm against the full source.
1285 function device_glsc2(a_d, b_d, n, strm) result(res)
1286 type(c_ptr) :: a_d, b_d
1287 integer :: n, ierr
1288 real(kind=rp) :: res
1289 type(c_ptr), optional :: strm
1290 type(c_ptr) :: strm_
1291
1292 if (present(strm)) then
1293 strm_ = strm
1294 else
1295 strm_ = glb_cmd_queue
1296 end if
1297
! Local contribution from this rank's device data
1298 res = 0.0_rp
1299#if HAVE_HIP
1300 res = hip_glsc2(a_d, b_d, n, strm_)
1301#elif HAVE_CUDA
1302 res = cuda_glsc2(a_d, b_d, n, strm_)
1303#elif HAVE_OPENCL
1304 res = opencl_glsc2(a_d, b_d, n, strm_)
1305#else
1306 call neko_error('No device backend configured')
1307#endif
1308
! Host-side reduction only when device-aware MPI is not available
1309#ifndef HAVE_DEVICE_MPI
1310 if (pe_size .gt. 1) then
1311 call mpi_allreduce(mpi_in_place, res, 1, &
1312 mpi_real_precision, mpi_sum, neko_comm, ierr)
1313 end if
1314#endif
1315 end function device_glsc2
1316
1319 function device_glsubnorm(a_d, b_d, n, strm) result(res)
1320 type(c_ptr), intent(in) :: a_d, b_d
1321 integer, intent(in) :: n
1322 integer :: ierr
1323 real(kind=rp) :: res
1324 type(c_ptr), optional :: strm
1325 type(c_ptr) :: strm_
1326
1327 if (present(strm)) then
1328 strm_ = strm
1329 else
1330 strm_ = glb_cmd_queue
1331 end if
1332
1333 res = 0.0_rp
1334#if HAVE_HIP
1335 res = hip_glsubnorm2(a_d, b_d, n, strm_)
1336#elif HAVE_CUDA
1337 res = cuda_glsubnorm2(a_d, b_d, n, strm_)
1338#elif HAVE_OPENCL
1339 res = opencl_glsubnorm2(a_d, b_d, n, strm_)
1340#else
1341 call neko_error('No device backend configured')
1342#endif
1343
1344#ifndef HAVE_DEVICE_MPI
1345 if (pe_size .gt. 1) then
1346 call mpi_allreduce(mpi_in_place, res, 1, &
1347 mpi_real_precision, mpi_sum, neko_comm, ierr)
1348 end if
1349#endif
1350
1351 res = sqrt(res)
1352 end function device_glsubnorm
1353
1355 function device_glsum(a_d, n, strm) result(res)
1356 type(c_ptr) :: a_d
1357 integer :: n, ierr
1358 real(kind=rp) :: res
1359 type(c_ptr), optional :: strm
1360 type(c_ptr) :: strm_
1361
1362 if (present(strm)) then
1363 strm_ = strm
1364 else
1365 strm_ = glb_cmd_queue
1366 end if
1367
1368 res = 0.0_rp
1369#if HAVE_HIP
1370 res = hip_glsum(a_d, n, strm_)
1371#elif HAVE_CUDA
1372 res = cuda_glsum(a_d, n, strm_)
1373#elif HAVE_OPENCL
1374 res = opencl_glsum(a_d, n, strm_)
1375#else
1376 call neko_error('No device backend configured')
1377#endif
1378
1379#ifndef HAVE_DEVICE_MPI
1380 if (pe_size .gt. 1) then
1381 call mpi_allreduce(mpi_in_place, res, 1, &
1382 mpi_real_precision, mpi_sum, neko_comm, ierr)
1383 end if
1384#endif
1385 end function device_glsum
1386
1388 function device_glmax(a_d, n, strm) result(res)
1389 type(c_ptr) :: a_d
1390 integer :: n, ierr
1391 real(kind=rp) :: res, ninf
1392 type(c_ptr), optional :: strm
1393 type(c_ptr) :: strm_
1394
1395 if (n .lt. 1) then
1396 res = -huge(0.0_rp)
1397 return
1398 end if
1399
1400 if (present(strm)) then
1401 strm_ = strm
1402 else
1403 strm_ = glb_cmd_queue
1404 end if
1405
1406 ninf = -huge(0.0_rp)
1407#if HAVE_HIP
1408 res = hip_glmax(a_d, ninf, n, strm_)
1409#elif HAVE_CUDA
1410 res = cuda_glmax(a_d, ninf, n, strm_)
1411#elif HAVE_OPENCL
1412 res = opencl_glmax(a_d, n, strm_)
1413#else
1414 call neko_error('No device backend configured')
1415#endif
1416
1417#ifndef HAVE_DEVICE_MPI
1418 if (pe_size .gt. 1) then
1419 call mpi_allreduce(mpi_in_place, res, 1, &
1420 mpi_real_precision, mpi_max, neko_comm, ierr)
1421 end if
1422#endif
1423 end function device_glmax
1424
1426 function device_glmin(a_d, n, strm) result(res)
1427 type(c_ptr) :: a_d
1428 integer :: n, ierr
1429 real(kind=rp) :: res, pinf
1430 type(c_ptr), optional :: strm
1431 type(c_ptr) :: strm_
1432
1433 if (n .lt. 1) then
1434 res = huge(0.0_rp)
1435 return
1436 end if
1437
1438 if (present(strm)) then
1439 strm_ = strm
1440 else
1441 strm_ = glb_cmd_queue
1442 end if
1443
1444 pinf = huge(0.0_rp)
1445#if HAVE_HIP
1446 res = hip_glmin(a_d, pinf, n, strm_)
1447#elif HAVE_CUDA
1448 res = cuda_glmin(a_d, pinf, n, strm_)
1449#elif HAVE_OPENCL
1450 res = opencl_glmin(a_d, n, strm_)
1451#else
1452 call neko_error('No device backend configured')
1453#endif
1454
1455#ifndef HAVE_DEVICE_MPI
1456 if (pe_size .gt. 1) then
1457 call mpi_allreduce(mpi_in_place, res, 1, &
1458 mpi_real_precision, mpi_min, neko_comm, ierr)
1459 end if
1460#endif
1461 end function device_glmin
1462
1463 subroutine device_absval(a_d, n, strm)
1464 integer, intent(in) :: n
1465 type(c_ptr) :: a_d
1466 type(c_ptr), optional :: strm
1467 type(c_ptr) :: strm_
1468
1469 if (n .lt. 1) return
1470
1471 if (present(strm)) then
1472 strm_ = strm
1473 else
1474 strm_ = glb_cmd_queue
1475 end if
1476
1477#ifdef HAVE_HIP
1478 call hip_absval(a_d, n, strm_)
1479#elif HAVE_CUDA
1480 call cuda_absval(a_d, n, strm_)
1481#elif HAVE_OPENCL
1482 call opencl_absval(a_d, n, strm_)
1483#else
1484 call neko_error('No device backend configured')
1485#endif
1486
1487 end subroutine device_absval
1488
1489 ! ========================================================================== !
1490 ! Device point-wise max
1491
1494 subroutine device_pwmax2(a_d, b_d, n, strm)
1495 type(c_ptr) :: a_d, b_d
1496 integer :: n
1497 type(c_ptr), optional :: strm
1498 type(c_ptr) :: strm_
1499
1500 if (n .lt. 1) return
1501
1502 if (present(strm)) then
1503 strm_ = strm
1504 else
1505 strm_ = glb_cmd_queue
1506 end if
1507
1508#if HAVE_HIP
1509 call hip_pwmax_vec2(a_d, b_d, n, strm_)
1510#elif HAVE_CUDA
1511 call cuda_pwmax_vec2(a_d, b_d, n, strm_)
1512#elif HAVE_OPENCL
1513 call opencl_pwmax_vec2(a_d, b_d, n, strm_)
1514#else
1515 call neko_error('No device backend configured')
1516#endif
1517 end subroutine device_pwmax2
1518
1521 subroutine device_pwmax3(a_d, b_d, c_d, n, strm)
1522 type(c_ptr) :: a_d, b_d, c_d
1523 integer :: n
1524 type(c_ptr), optional :: strm
1525 type(c_ptr) :: strm_
1526
1527 if (n .lt. 1) return
1528
1529 if (present(strm)) then
1530 strm_ = strm
1531 else
1532 strm_ = glb_cmd_queue
1533 end if
1534
1535#if HAVE_HIP
1536 call hip_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1537#elif HAVE_CUDA
1538 call cuda_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1539#elif HAVE_OPENCL
1540 call opencl_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1541#else
1542 call neko_error('No device backend configured')
1543#endif
1544
1545 end subroutine device_pwmax3
1546
1549 subroutine device_cpwmax2(a_d, c, n, strm)
1550 type(c_ptr) :: a_d
1551 real(kind=rp), intent(in) :: c
1552 integer :: n
1553 type(c_ptr), optional :: strm
1554 type(c_ptr) :: strm_
1555
1556 if (n .lt. 1) return
1557
1558 if (present(strm)) then
1559 strm_ = strm
1560 else
1561 strm_ = glb_cmd_queue
1562 end if
1563
1564#if HAVE_HIP
1565 call hip_pwmax_sca2(a_d, c, n, strm_)
1566#elif HAVE_CUDA
1567 call cuda_pwmax_sca2(a_d, c, n, strm_)
1568#elif HAVE_OPENCL
1569 call opencl_pwmax_sca2(a_d, c, n, strm_)
1570#else
1571 call neko_error('No device backend configured')
1572#endif
1573
1574 end subroutine device_cpwmax2
1575
1578 subroutine device_cpwmax3(a_d, b_d, c, n, strm)
1579 type(c_ptr) :: a_d, b_d
1580 real(kind=rp), intent(in) :: c
1581 integer :: n
1582 type(c_ptr), optional :: strm
1583 type(c_ptr) :: strm_
1584
1585 if (n .lt. 1) return
1586
1587 if (present(strm)) then
1588 strm_ = strm
1589 else
1590 strm_ = glb_cmd_queue
1591 end if
1592
1593#if HAVE_HIP
1594 call hip_pwmax_sca3(a_d, b_d, c, n, strm_)
1595#elif HAVE_CUDA
1596 call cuda_pwmax_sca3(a_d, b_d, c, n, strm_)
1597#elif HAVE_OPENCL
1598 call opencl_pwmax_sca3(a_d, b_d, c, n, strm_)
1599#else
1600 call neko_error('No device backend configured')
1601#endif
1602
1603 end subroutine device_cpwmax3
1604
1605 ! ========================================================================== !
1606 ! Device point-wise min
1607
1610 subroutine device_pwmin2(a_d, b_d, n, strm)
1611 type(c_ptr) :: a_d, b_d
1612 integer :: n
1613 type(c_ptr), optional :: strm
1614 type(c_ptr) :: strm_
1615
1616 if (n .lt. 1) return
1617
1618 if (present(strm)) then
1619 strm_ = strm
1620 else
1621 strm_ = glb_cmd_queue
1622 end if
1623
1624#if HAVE_HIP
1625 call hip_pwmin_vec2(a_d, b_d, n, strm_)
1626#elif HAVE_CUDA
1627 call cuda_pwmin_vec2(a_d, b_d, n, strm_)
1628#elif HAVE_OPENCL
1629 call opencl_pwmin_vec2(a_d, b_d, n, strm_)
1630#else
1631 call neko_error('No device backend configured')
1632#endif
1633 end subroutine device_pwmin2
1634
1637 subroutine device_pwmin3(a_d, b_d, c_d, n, strm)
1638 type(c_ptr) :: a_d, b_d, c_d
1639 integer :: n
1640 type(c_ptr), optional :: strm
1641 type(c_ptr) :: strm_
1642
1643 if (n .lt. 1) return
1644
1645 if (present(strm)) then
1646 strm_ = strm
1647 else
1648 strm_ = glb_cmd_queue
1649 end if
1650
1651#if HAVE_HIP
1652 call hip_pwmin_vec3(a_d, b_d, c_d, n, strm_)
1653#elif HAVE_CUDA
1654 call cuda_pwmin_vec3(a_d, b_d, c_d, n, strm_)
1655#elif HAVE_OPENCL
1656 call opencl_pwmin_vec3(a_d, b_d, c_d, n, strm_)
1657#else
1658 call neko_error('No device backend configured')
1659#endif
1660
1661 end subroutine device_pwmin3
1662
1665 subroutine device_cpwmin2(a_d, c, n, strm)
1666 type(c_ptr) :: a_d
1667 real(kind=rp), intent(in) :: c
1668 integer :: n
1669 type(c_ptr), optional :: strm
1670 type(c_ptr) :: strm_
1671
1672 if (n .lt. 1) return
1673
1674 if (present(strm)) then
1675 strm_ = strm
1676 else
1677 strm_ = glb_cmd_queue
1678 end if
1679
1680#if HAVE_HIP
1681 call hip_pwmin_sca2(a_d, c, n, strm_)
1682#elif HAVE_CUDA
1683 call cuda_pwmin_sca2(a_d, c, n, strm_)
1684#elif HAVE_OPENCL
1685 call opencl_pwmin_sca2(a_d, c, n, strm_)
1686#else
1687 call neko_error('No device backend configured')
1688#endif
1689
1690 end subroutine device_cpwmin2
1691
1694 subroutine device_cpwmin3(a_d, b_d, c, n, strm)
1695 type(c_ptr) :: a_d, b_d
1696 real(kind=rp), intent(in) :: c
1697 integer :: n
1698 type(c_ptr), optional :: strm
1699 type(c_ptr) :: strm_
1700
1701 if (n .lt. 1) return
1702
1703 if (present(strm)) then
1704 strm_ = strm
1705 else
1706 strm_ = glb_cmd_queue
1707 end if
1708
1709#if HAVE_HIP
1710 call hip_pwmin_sca3(a_d, b_d, c, n, strm_)
1711#elif HAVE_CUDA
1712 call cuda_pwmin_sca3(a_d, b_d, c, n, strm_)
1713#elif HAVE_OPENCL
1714 call opencl_pwmin_sca3(a_d, b_d, c, n, strm_)
1715#else
1716 call neko_error('No device backend configured')
1717#endif
1718
1719 end subroutine device_cpwmin3
1720
1721 ! ========================================================================== !
1722 ! Integer operations
1723
1725 subroutine device_iadd(a_d, c, n, strm)
1726 type(c_ptr), intent(inout) :: a_d
1727 integer, intent(in) :: c
1728 integer, intent(in) :: n
1729 type(c_ptr), optional :: strm
1730 type(c_ptr) :: strm_
1731 if (n .lt. 1) return
1732
1733 if (present(strm)) then
1734 strm_ = strm
1735 else
1736 strm_ = glb_cmd_queue
1737 end if
1738
1739#if HAVE_HIP
1740 call hip_iadd(a_d, c, n, strm_)
1741#elif HAVE_CUDA
1742 call cuda_iadd(a_d, c, n, strm_)
1743#elif HAVE_OPENCL
1744 call opencl_iadd(a_d, c, n, strm_)
1745#else
1746 call neko_error('No device backend configured')
1747#endif
1748 end subroutine device_iadd
1749
1750
1751end module device_math
Definition comm.F90:1
type(mpi_datatype), public mpi_real_precision
MPI type for working precision of REAL types.
Definition comm.F90:51
integer, public pe_size
MPI size of communicator.
Definition comm.F90:59
type(mpi_comm), public neko_comm
MPI communicator.
Definition comm.F90:43
subroutine, public device_pwmin2(a_d, b_d, n, strm)
Compute the point-wise minimum of two vectors .
subroutine, public device_masked_scatter_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
Scatter a masked vector .
subroutine, public device_add2s1(a_d, b_d, c1, n, strm)
subroutine, public device_add2s2_many(y_d, x_d_d, a_d, j, n, strm)
subroutine, public device_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm)
Returns .
subroutine, public device_sub3(a_d, b_d, c_d, n, strm)
Vector subtraction .
subroutine, public device_masked_scatter_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Scatter a masked vector .
subroutine, public device_add2s2(a_d, b_d, c1, n, strm)
Vector addition with scalar multiplication (multiplication on first argument)
real(kind=rp) function, public device_glmax(a_d, n, strm)
Max of a vector of length n.
subroutine, public device_add2(a_d, b_d, n, strm)
Vector addition .
subroutine, public device_addcol3(a_d, b_d, c_d, n, strm)
Returns .
real(kind=rp) function, public device_glsum(a_d, n, strm)
Sum a vector of length n.
subroutine, public device_pwmax3(a_d, b_d, c_d, n, strm)
Compute the point-wise maximum of two vectors .
subroutine, public device_invcol1(a_d, n, strm)
Invert a vector .
subroutine, public device_add3s2(a_d, b_d, c_d, c1, c2, n, strm)
Returns .
subroutine, public device_masked_atomic_reduction_0(a_d, b_d, mask_d, n, n_mask, strm)
subroutine, public device_cpwmax3(a_d, b_d, c, n, strm)
Compute the point-wise maximum of a vector and a scalar .
subroutine, public device_rzero(a_d, n, strm)
Zero a real vector.
subroutine, public device_rone(a_d, n, strm)
Set all elements to one.
subroutine, public device_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, w1_d, w2_d, w3_d, n, strm)
Compute a cross product (3-d version) assuming vector components etc.
subroutine, public device_cmult(a_d, c, n, strm)
Multiplication by constant c .
subroutine, public device_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm)
Compute a dot product (3-d version) assuming vector components etc.
real(kind=rp) function, public device_glsubnorm(a_d, b_d, n, strm)
Returns the norm of the difference of two vectors .
subroutine, public device_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm)
subroutine, public device_cfill_mask(a_d, c, n, mask_d, n_mask, strm)
Fill a constant to a masked vector. .
subroutine, public device_cadd2(a_d, b_d, c, n, strm)
Add a scalar to vector .
subroutine, public device_sub2(a_d, b_d, n, strm)
Vector substraction .
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
subroutine, public device_invcol3(a_d, b_d, c_d, n, strm)
Vector division .
subroutine, public device_pwmin3(a_d, b_d, c_d, n, strm)
Compute the point-wise minimum of two vectors .
subroutine, public device_col2(a_d, b_d, n, strm)
Vector multiplication .
subroutine, public device_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
Gather a masked vector .
real(kind=rp) function, public device_vlsc3(u_d, v_d, w_d, n, strm)
Compute multiplication sum .
subroutine, public device_cdiv2(a_d, b_d, c, n, strm)
Division of constant c by array .
subroutine, public device_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm)
Returns .
subroutine, public device_add4(a_d, b_d, c_d, d_d, n, strm)
subroutine, public device_cpwmax2(a_d, c, n, strm)
Compute the point-wise maximum of a vector and a scalar .
subroutine, public device_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Copy a masked vector .
subroutine, public device_subcol3(a_d, b_d, c_d, n, strm)
Returns .
subroutine, public device_cdiv(a_d, c, n, strm)
Division of constant c by array .
subroutine, public device_absval(a_d, n, strm)
subroutine device_iadd(a_d, c, n, strm)
Add an integer scalar to vector .
subroutine, public device_masked_gather_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Gather a masked vector .
subroutine, public device_invcol2(a_d, b_d, n, strm)
Vector division .
subroutine, public device_addsqr2s2(a_d, b_d, c1, n, strm)
Returns .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
subroutine, public device_cpwmin3(a_d, b_d, c, n, strm)
Compute the point-wise minimum of a vector and a scalar .
subroutine, public device_cwrap(a_d, min_val, max_val, n, strm)
Wrap value around a range (min, max)
subroutine, public device_face_masked_gather_copy_0(a_d, b_d, mask_d, facet_d, n1, n2, lx, ly, lz, n_mask, strm)
Gather a face-local SEM field .
real(kind=rp) function, public device_glsc2(a_d, b_d, n, strm)
Weighted inner product .
subroutine, public device_cmult2(a_d, b_d, c, n, strm)
Multiplication by constant c .
subroutine, public device_col3(a_d, b_d, c_d, n, strm)
Vector multiplication with 3 vectors .
subroutine, public device_addcol4(a_d, b_d, c_d, d_d, n, strm)
Returns .
subroutine, public device_cfill(a_d, c, n, strm)
Set all elements to a constant c .
subroutine, public device_cpwmin2(a_d, c, n, strm)
Compute the point-wise minimum of a vector and a scalar .
subroutine, public device_add3(a_d, b_d, c_d, n, strm)
Vector addition .
real(kind=rp) function, public device_glmin(a_d, n, strm)
Min of a vector of length n.
subroutine, public device_addcol3s2(a_d, b_d, c_d, s, n, strm)
Returns .
subroutine, public device_pwmax2(a_d, b_d, n, strm)
Compute the point-wise maximum of two vectors .
subroutine device_radd(a_d, c, n, strm)
Add a scalar to vector .
Device abstraction, common interface for various accelerators.
Definition device.F90:34
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
Definition device.F90:51
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Utilities.
Definition utils.f90:35