Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
device_math.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2025, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
34 use, intrinsic :: iso_c_binding, only: c_ptr, c_int
35 use num_types, only : rp, c_rp
36 use utils, only : neko_error
38 use mpi_f08, only : mpi_sum, mpi_in_place, mpi_allreduce
39 use device, only : glb_cmd_queue
40 ! ========================================================================== !
41 ! Device math interfaces
42
43 use hip_math
44 use cuda_math
45 use opencl_math
46
47 implicit none
48 private
49
50 interface device_cadd
51 module procedure device_radd, device_iadd
52 end interface device_cadd
53
68
69contains
70
72 subroutine device_copy(a_d, b_d, n, strm)
73 type(c_ptr) :: a_d, b_d
74 integer :: n
75 type(c_ptr), optional :: strm
76 type(c_ptr) :: strm_
77
78 if (n .lt. 1) return
79
80 if (present(strm)) then
81 strm_ = strm
82 else
83 strm_ = glb_cmd_queue
84 end if
85
86#if HAVE_HIP
87 call hip_copy(a_d, b_d, n, strm_)
88#elif HAVE_CUDA
89 call cuda_copy(a_d, b_d, n, strm_)
90#elif HAVE_OPENCL
91 call opencl_copy(a_d, b_d, n, strm_)
92#else
93 call neko_error('no device backend configured')
94#endif
95 end subroutine device_copy
96
98 subroutine device_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
99 type(c_ptr) :: a_d, b_d, mask_d
100 integer :: n, n_mask
101 type(c_ptr), optional :: strm
102 type(c_ptr) :: strm_
103
104 if (n .lt. 1 .or. n_mask .lt. 1) return
105
106 if (present(strm)) then
107 strm_ = strm
108 else
109 strm_ = glb_cmd_queue
110 end if
111
112#if HAVE_HIP
113 call hip_masked_copy(a_d, b_d, mask_d, n, n_mask, strm_)
114#elif HAVE_CUDA
115 call cuda_masked_copy(a_d, b_d, mask_d, n, n_mask, strm_)
116#elif HAVE_OPENCL
117 call opencl_masked_copy(a_d, b_d, mask_d, n, n_mask, strm_)
118#else
119 call neko_error('no device backend configured')
120#endif
121 end subroutine device_masked_copy_0
122
124 subroutine device_masked_gather_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
125 type(c_ptr) :: a_d, b_d, mask_d
126 integer :: n, n_mask
127 type(c_ptr), optional :: strm
128 type(c_ptr) :: strm_
129
130 if (n .lt. 1 .or. n_mask .lt. 1) return
131
132 if (present(strm)) then
133 strm_ = strm
134 else
135 strm_ = glb_cmd_queue
136 end if
137
138#if HAVE_HIP
139 call hip_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
140#elif HAVE_CUDA
141 call cuda_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
142#elif HAVE_OPENCL
143 call opencl_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
144#else
145 call neko_error('no device backend configured')
146#endif
147 end subroutine device_masked_gather_copy_0
148
150 subroutine device_masked_scatter_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
151 type(c_ptr) :: a_d, b_d, mask_d
152 integer :: n, n_mask
153 type(c_ptr), optional :: strm
154 type(c_ptr) :: strm_
155
156 if (n .lt. 1 .or. n_mask .lt. 1) return
157
158 if (present(strm)) then
159 strm_ = strm
160 else
161 strm_ = glb_cmd_queue
162 end if
163
164#if HAVE_HIP
165 call hip_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
166#elif HAVE_CUDA
167 call cuda_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
168#elif HAVE_OPENCL
169 call opencl_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
170#else
171 call neko_error('no device backend configured')
172#endif
173 end subroutine device_masked_scatter_copy_0
174
175 subroutine device_masked_atomic_reduction_0(a_d, b_d, mask_d, n, n_mask, strm)
176 type(c_ptr) :: a_d, b_d, mask_d
177 integer :: n, n_mask
178 type(c_ptr), optional :: strm
179 type(c_ptr) :: strm_
180
181 if (n .lt. 1 .or. n_mask .lt. 1) return
182
183 if (present(strm)) then
184 strm_ = strm
185 else
186 strm_ = glb_cmd_queue
187 end if
188
189#if HAVE_HIP
190 call hip_masked_atomic_reduction(a_d, b_d, mask_d, n, n_mask, strm_)
191#elif HAVE_CUDA
192 call cuda_masked_atomic_reduction(a_d, b_d, mask_d, n, n_mask, strm_)
193#elif HAVE_OPENCL
194 call neko_error('No OpenCL bcknd, masked atomic reduction')
195#else
196 call neko_error('no device backend configured')
197#endif
199
202 subroutine device_cfill_mask(a_d, c, n, mask_d, n_mask, strm)
203 type(c_ptr) :: a_d
204 real(kind=rp), intent(in) :: c
205 integer :: n
206 type(c_ptr) :: mask_d
207 integer :: n_mask
208 type(c_ptr), optional :: strm
209 type(c_ptr) :: strm_
210
211 if (n .lt. 1 .or. n_mask .lt. 1) return
212
213 if (present(strm)) then
214 strm_ = strm
215 else
216 strm_ = glb_cmd_queue
217 end if
218
219#if HAVE_HIP
220 call hip_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
221#elif HAVE_CUDA
222 call cuda_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
223#elif HAVE_OPENCL
224 call opencl_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
225#else
226 call neko_error('No device backend configured')
227#endif
228 end subroutine device_cfill_mask
229
231 subroutine device_rzero(a_d, n, strm)
232 type(c_ptr) :: a_d
233 integer :: n
234 type(c_ptr), optional :: strm
235 type(c_ptr) :: strm_
236
237 if (n .lt. 1) return
238
239 if (present(strm)) then
240 strm_ = strm
241 else
242 strm_ = glb_cmd_queue
243 end if
244
245#if HAVE_HIP
246 call hip_rzero(a_d, n, strm_)
247#elif HAVE_CUDA
248 call cuda_rzero(a_d, n, strm_)
249#elif HAVE_OPENCL
250 call opencl_rzero(a_d, n, strm_)
251#else
252 call neko_error('No device backend configured')
253#endif
254 end subroutine device_rzero
255
257 subroutine device_rone(a_d, n, strm)
258 type(c_ptr) :: a_d
259 integer :: n
260 type(c_ptr), optional :: strm
261 type(c_ptr) :: strm_
262 real(kind=rp), parameter :: one = 1.0_rp
263
264 if (n .lt. 1) return
265
266 if (present(strm)) then
267 strm_ = strm
268 else
269 strm_ = glb_cmd_queue
270 end if
271
272#if HAVE_HIP || HAVE_CUDA || HAVE_OPENCL
273 call device_cfill(a_d, one, n, strm_)
274#else
275 call neko_error('No device backend configured')
276#endif
277 end subroutine device_rone
278
280 subroutine device_cmult(a_d, c, n, strm)
281 type(c_ptr) :: a_d
282 real(kind=rp), intent(in) :: c
283 integer :: n
284 type(c_ptr), optional :: strm
285 type(c_ptr) :: strm_
286
287 if (n .lt. 1) return
288
289 if (present(strm)) then
290 strm_ = strm
291 else
292 strm_ = glb_cmd_queue
293 end if
294
295#if HAVE_HIP
296 call hip_cmult(a_d, c, n, strm_)
297#elif HAVE_CUDA
298 call cuda_cmult(a_d, c, n, strm_)
299#elif HAVE_OPENCL
300 call opencl_cmult(a_d, c, n, strm_)
301#else
302 call neko_error('No device backend configured')
303#endif
304 end subroutine device_cmult
305
307 subroutine device_cmult2(a_d, b_d, c, n, strm)
308 type(c_ptr) :: a_d, b_d
309 real(kind=rp), intent(in) :: c
310 integer :: n
311 type(c_ptr), optional :: strm
312 type(c_ptr) :: strm_
313
314 if (n .lt. 1) return
315
316 if (present(strm)) then
317 strm_ = strm
318 else
319 strm_ = glb_cmd_queue
320 end if
321
322#if HAVE_HIP
323 call hip_cmult2(a_d, b_d, c, n, strm_)
324#elif HAVE_CUDA
325 call cuda_cmult2(a_d, b_d, c, n, strm_)
326#elif HAVE_OPENCL
327 call opencl_cmult2(a_d, b_d, c, n, strm_)
328#else
329 call neko_error('No device backend configured')
330#endif
331 end subroutine device_cmult2
332
334 subroutine device_cdiv(a_d, c, n, strm)
335 type(c_ptr) :: a_d
336 real(kind=rp), intent(in) :: c
337 integer :: n
338 type(c_ptr), optional :: strm
339 type(c_ptr) :: strm_
340
341 if (present(strm)) then
342 strm_ = strm
343 else
344 strm_ = glb_cmd_queue
345 end if
346
347#if HAVE_HIP
348 call hip_cdiv(a_d, c, n, strm_)
349#elif HAVE_CUDA
350 call cuda_cdiv(a_d, c, n, strm_)
351#elif HAVE_OPENCL
352 call opencl_cdiv(a_d, c, n, strm_)
353#else
354 call neko_error('No device backend configured')
355#endif
356 end subroutine device_cdiv
357
359 subroutine device_cdiv2(a_d, b_d, c, n, strm)
360 type(c_ptr) :: a_d, b_d
361 real(kind=rp), intent(in) :: c
362 integer :: n
363 type(c_ptr), optional :: strm
364 type(c_ptr) :: strm_
365
366 if (present(strm)) then
367 strm_ = strm
368 else
369 strm_ = glb_cmd_queue
370 end if
371
372#if HAVE_HIP
373 call hip_cdiv2(a_d, b_d, c, n, strm_)
374#elif HAVE_CUDA
375 call cuda_cdiv2(a_d, b_d, c, n, strm_)
376#elif HAVE_OPENCL
377 call opencl_cdiv2(a_d, b_d, c, n, strm_)
378#else
379 call neko_error('No device backend configured')
380#endif
381 end subroutine device_cdiv2
382
384 subroutine device_radd(a_d, c, n, strm)
385 type(c_ptr) :: a_d
386 real(kind=rp), intent(in) :: c
387 integer :: n
388 type(c_ptr), optional :: strm
389 type(c_ptr) :: strm_
390
391 if (n .lt. 1) return
392
393 if (present(strm)) then
394 strm_ = strm
395 else
396 strm_ = glb_cmd_queue
397 end if
398
399#if HAVE_HIP
400 call hip_radd(a_d, c, n, strm_)
401#elif HAVE_CUDA
402 call cuda_radd(a_d, c, n, strm_)
403#elif HAVE_OPENCL
404 call opencl_radd(a_d, c, n, strm_)
405#else
406 call neko_error('No device backend configured')
407#endif
408 end subroutine device_radd
409
411 subroutine device_cadd2(a_d, b_d, c, n, strm)
412 type(c_ptr) :: a_d
413 type(c_ptr) :: b_d
414 real(kind=rp), intent(in) :: c
415 integer :: n
416 type(c_ptr), optional :: strm
417 type(c_ptr) :: strm_
418
419 if (n .lt. 1) return
420
421 if (present(strm)) then
422 strm_ = strm
423 else
424 strm_ = glb_cmd_queue
425 end if
426
427#if HAVE_HIP
428 call hip_cadd2(a_d, b_d, c, n, strm_)
429#elif HAVE_CUDA
430 call cuda_cadd2(a_d, b_d, c, n, strm_)
431#elif HAVE_OPENCL
432 call opencl_cadd2(a_d, b_d, c, n, strm_)
433#else
434 call neko_error('No device backend configured')
435#endif
436 end subroutine device_cadd2
437
439 subroutine device_cfill(a_d, c, n, strm)
440 type(c_ptr) :: a_d
441 real(kind=rp), intent(in) :: c
442 integer :: n
443 type(c_ptr), optional ::strm
444 type(c_ptr) :: strm_
445
446 if (n .lt. 1) return
447
448 if (present(strm)) then
449 strm_ = strm
450 else
451 strm_ = glb_cmd_queue
452 end if
453
454#if HAVE_HIP
455 call hip_cfill(a_d, c, n, strm_)
456#elif HAVE_CUDA
457 call cuda_cfill(a_d, c, n, strm_)
458#elif HAVE_OPENCL
459 call opencl_cfill(a_d, c, n, strm_)
460#else
461 call neko_error('No device backend configured')
462#endif
463 end subroutine device_cfill
464
466 subroutine device_add2(a_d, b_d, n, strm)
467 type(c_ptr) :: a_d, b_d
468 integer :: n
469 type(c_ptr), optional :: strm
470 type(c_ptr) :: strm_
471
472 if (n .lt. 1) return
473
474 if (present(strm)) then
475 strm_ = strm
476 else
477 strm_ = glb_cmd_queue
478 end if
479
480#if HAVE_HIP
481 call hip_add2(a_d, b_d, n, strm_)
482#elif HAVE_CUDA
483 call cuda_add2(a_d, b_d, n, strm_)
484#elif HAVE_OPENCL
485 call opencl_add2(a_d, b_d, n, strm_)
486#else
487 call neko_error('No device backend configured')
488#endif
489 end subroutine device_add2
490
491 subroutine device_add4(a_d, b_d, c_d, d_d, n, strm)
492 type(c_ptr) :: a_d, b_d, c_d, d_d
493 integer :: n
494 type(c_ptr), optional :: strm
495 type(c_ptr) :: strm_
496
497 if (n .lt. 1) return
498
499 if (present(strm)) then
500 strm_ = strm
501 else
502 strm_ = glb_cmd_queue
503 end if
504
505#if HAVE_HIP
506 call hip_add4(a_d, b_d, c_d, d_d, n, strm_)
507#elif HAVE_CUDA
508 call cuda_add4(a_d, b_d, c_d, d_d, n, strm_)
509#elif HAVE_OPENCL
510 call opencl_add4(a_d, b_d, c_d, d_d, n, strm_)
511#else
512 call neko_error('No device backend configured')
513#endif
514 end subroutine device_add4
515
516 subroutine device_add2s1(a_d, b_d, c1, n, strm)
517 type(c_ptr) :: a_d, b_d
518 real(kind=rp) :: c1
519 integer :: n
520 type(c_ptr), optional :: strm
521 type(c_ptr) :: strm_
522
523 if (n .lt. 1) return
524
525 if (present(strm)) then
526 strm_ = strm
527 else
528 strm_ = glb_cmd_queue
529 end if
530
531#if HAVE_HIP
532 call hip_add2s1(a_d, b_d, c1, n, strm_)
533#elif HAVE_CUDA
534 call cuda_add2s1(a_d, b_d, c1, n, strm_)
535#elif HAVE_OPENCL
536 call opencl_add2s1(a_d, b_d, c1, n, strm_)
537#else
538 call neko_error('No device backend configured')
539#endif
540 end subroutine device_add2s1
541
544 subroutine device_add2s2(a_d, b_d, c1, n, strm)
545 type(c_ptr) :: a_d, b_d
546 real(kind=rp) :: c1
547 integer :: n
548 type(c_ptr), optional :: strm
549 type(c_ptr) :: strm_
550
551 if (n .lt. 1) return
552
553 if (present(strm)) then
554 strm_ = strm
555 else
556 strm_ = glb_cmd_queue
557 end if
558
559#if HAVE_HIP
560 call hip_add2s2(a_d, b_d, c1, n, strm_)
561#elif HAVE_CUDA
562 call cuda_add2s2(a_d, b_d, c1, n, strm_)
563#elif HAVE_OPENCL
564 call opencl_add2s2(a_d, b_d, c1, n, strm_)
565#else
566 call neko_error('No device backend configured')
567#endif
568 end subroutine device_add2s2
569
571 subroutine device_addsqr2s2(a_d, b_d, c1, n, strm)
572 type(c_ptr) :: a_d, b_d
573 real(kind=rp) :: c1
574 integer :: n
575 type(c_ptr), optional :: strm
576 type(c_ptr) :: strm_
577
578 if (n .lt. 1) return
579
580 if (present(strm)) then
581 strm_ = strm
582 else
583 strm_ = glb_cmd_queue
584 end if
585
586#if HAVE_HIP
587 call hip_addsqr2s2(a_d, b_d, c1, n, strm_)
588#elif HAVE_CUDA
589 call cuda_addsqr2s2(a_d, b_d, c1, n, strm_)
590#elif HAVE_OPENCL
591 call opencl_addsqr2s2(a_d, b_d, c1, n, strm_)
592#else
593 call neko_error('No device backend configured')
594#endif
595 end subroutine device_addsqr2s2
596
598 subroutine device_add3(a_d, b_d, c_d, n, strm)
599 type(c_ptr) :: a_d, b_d, c_d
600 integer :: n
601 type(c_ptr), optional :: strm
602 type(c_ptr) :: strm_
603
604 if (n .lt. 1) return
605
606 if (present(strm)) then
607 strm_ = strm
608 else
609 strm_ = glb_cmd_queue
610 end if
611
612#if HAVE_HIP
613 call hip_add3(a_d, b_d, c_d, n, strm_)
614#elif HAVE_CUDA
615 call cuda_add3(a_d, b_d, c_d, n, strm_)
616#elif HAVE_OPENCL
617 call opencl_add3(a_d, b_d, c_d, n, strm_)
618#else
619 call neko_error('No device backend configured')
620#endif
621 end subroutine device_add3
622
624 subroutine device_add3s2(a_d, b_d, c_d, c1, c2 , n, strm)
625 type(c_ptr) :: a_d, b_d, c_d
626 real(kind=rp) :: c1, c2
627 integer :: n
628 type(c_ptr), optional :: strm
629 type(c_ptr) :: strm_
630
631 if (n .lt. 1) return
632
633 if (present(strm)) then
634 strm_ = strm
635 else
636 strm_ = glb_cmd_queue
637 end if
638
639#if HAVE_HIP
640 call hip_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
641#elif HAVE_CUDA
642 call cuda_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
643#elif HAVE_OPENCL
644 call opencl_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
645#else
646 call neko_error('No device backend configured')
647#endif
648 end subroutine device_add3s2
649
651 subroutine device_add4s3(a_d, b_d, c_d, d_d, c1, c2 , c3, n, strm)
652 type(c_ptr) :: a_d, b_d, c_d, d_d
653 real(kind=rp) :: c1, c2, c3
654 integer :: n
655 type(c_ptr), optional :: strm
656 type(c_ptr) :: strm_
657
658 if (n .lt. 1) return
659
660 if (present(strm)) then
661 strm_ = strm
662 else
663 strm_ = glb_cmd_queue
664 end if
665
666#if HAVE_HIP
667 call hip_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
668#elif HAVE_CUDA
669 call cuda_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
670#elif HAVE_OPENCL
671 call opencl_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
672#else
673 call neko_error('No device backend configured')
674#endif
675 end subroutine device_add4s3
676
678 subroutine device_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2 , c3, c4, n, strm)
679 type(c_ptr) :: a_d, b_d, c_d, d_d, e_d
680 real(kind=rp) :: c1, c2, c3, c4
681 integer :: n
682 type(c_ptr), optional :: strm
683 type(c_ptr) :: strm_
684
685 if (n .lt. 1) return
686
687 if (present(strm)) then
688 strm_ = strm
689 else
690 strm_ = glb_cmd_queue
691 end if
692
693#if HAVE_HIP
694 call hip_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
695#elif HAVE_CUDA
696 call cuda_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
697#elif HAVE_OPENCL
698 call opencl_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
699#else
700 call neko_error('No device backend configured')
701#endif
702 end subroutine device_add5s4
703
705 subroutine device_invcol1(a_d, n, strm)
706 type(c_ptr) :: a_d
707 integer :: n
708 type(c_ptr), optional :: strm
709 type(c_ptr) :: strm_
710
711 if (n .lt. 1) return
712
713 if (present(strm)) then
714 strm_ = strm
715 else
716 strm_ = glb_cmd_queue
717 end if
718
719#if HAVE_HIP
720 call hip_invcol1(a_d, n, strm_)
721#elif HAVE_CUDA
722 call cuda_invcol1(a_d, n, strm_)
723#elif HAVE_OPENCL
724 call opencl_invcol1(a_d, n, strm_)
725#else
726 call neko_error('No device backend configured')
727#endif
728 end subroutine device_invcol1
729
731 subroutine device_invcol2(a_d, b_d, n, strm)
732 type(c_ptr) :: a_d, b_d
733 integer :: n
734 type(c_ptr), optional :: strm
735 type(c_ptr) :: strm_
736
737 if (n .lt. 1) return
738
739 if (present(strm)) then
740 strm_ = strm
741 else
742 strm_ = glb_cmd_queue
743 end if
744
745#if HAVE_HIP
746 call hip_invcol2(a_d, b_d, n, strm_)
747#elif HAVE_CUDA
748 call cuda_invcol2(a_d, b_d, n, strm_)
749#elif HAVE_OPENCL
750 call opencl_invcol2(a_d, b_d, n, strm_)
751#else
752 call neko_error('No device backend configured')
753#endif
754 end subroutine device_invcol2
755
757 subroutine device_invcol3(a_d, b_d, c_d, n, strm)
758 type(c_ptr) :: a_d, b_d, c_d
759 integer :: n
760 type(c_ptr), optional :: strm
761 type(c_ptr) :: strm_
762
763 if (present(strm)) then
764 strm_ = strm
765 else
766 strm_ = glb_cmd_queue
767 end if
768
769#ifdef HAVE_HIP
770 call hip_invcol3(a_d, b_d, c_d, n, strm_)
771#elif HAVE_CUDA
772 call cuda_invcol3(a_d, b_d, c_d, n, strm_)
773#elif HAVE_OPENCL
774 ! call opencl_invcol3(a_d, b_d, c_d, n)
775 call neko_error('opencl_invcol3 not implemented')
776#else
777 call neko_error('No device backend configured')
778#endif
779 end subroutine device_invcol3
780
782 subroutine device_col2(a_d, b_d, n, strm)
783 type(c_ptr) :: a_d, b_d
784 integer :: n
785 type(c_ptr), optional :: strm
786 type(c_ptr) :: strm_
787
788 if (present(strm)) then
789 strm_ = strm
790 else
791 strm_ = glb_cmd_queue
792 end if
793
794 if (n .lt. 1) return
795#if HAVE_HIP
796 call hip_col2(a_d, b_d, n, strm_)
797#elif HAVE_CUDA
798 call cuda_col2(a_d, b_d, n, strm_)
799#elif HAVE_OPENCL
800 call opencl_col2(a_d, b_d, n, strm_)
801#else
802 call neko_error('No device backend configured')
803#endif
804 end subroutine device_col2
805
807 subroutine device_col3(a_d, b_d, c_d, n, strm)
808 type(c_ptr) :: a_d, b_d, c_d
809 integer :: n
810 type(c_ptr), optional :: strm
811 type(c_ptr) :: strm_
812
813 if (n .lt. 1) return
814
815 if (present(strm)) then
816 strm_ = strm
817 else
818 strm_ = glb_cmd_queue
819 end if
820
821#if HAVE_HIP
822 call hip_col3(a_d, b_d, c_d, n, strm_)
823#elif HAVE_CUDA
824 call cuda_col3(a_d, b_d, c_d, n, strm_)
825#elif HAVE_OPENCL
826 call opencl_col3(a_d, b_d, c_d, n, strm_)
827#else
828 call neko_error('No device backend configured')
829#endif
830 end subroutine device_col3
831
833 subroutine device_subcol3(a_d, b_d, c_d, n, strm)
834 type(c_ptr) :: a_d, b_d, c_d
835 integer :: n
836 type(c_ptr), optional :: strm
837 type(c_ptr) :: strm_
838
839 if (n .lt. 1) return
840
841 if (present(strm)) then
842 strm_ = strm
843 else
844 strm_ = glb_cmd_queue
845 end if
846
847#if HAVE_HIP
848 call hip_subcol3(a_d, b_d, c_d, n, strm_)
849#elif HAVE_CUDA
850 call cuda_subcol3(a_d, b_d, c_d, n, strm_)
851#elif HAVE_OPENCL
852 call opencl_subcol3(a_d, b_d, c_d, n, strm_)
853#else
854 call neko_error('No device backend configured')
855#endif
856 end subroutine device_subcol3
857
859 subroutine device_sub2(a_d, b_d, n, strm)
860 type(c_ptr) :: a_d, b_d
861 integer :: n
862 type(c_ptr), optional :: strm
863 type(c_ptr) :: strm_
864
865 if (n .lt. 1) return
866
867 if (present(strm)) then
868 strm_ = strm
869 else
870 strm_ = glb_cmd_queue
871 end if
872
873#if HAVE_HIP
874 call hip_sub2(a_d, b_d, n, strm_)
875#elif HAVE_CUDA
876 call cuda_sub2(a_d, b_d, n, strm_)
877#elif HAVE_OPENCL
878 call opencl_sub2(a_d, b_d, n, strm_)
879#else
880 call neko_error('No device backend configured')
881#endif
882 end subroutine device_sub2
883
885 subroutine device_sub3(a_d, b_d, c_d, n, strm)
886 type(c_ptr) :: a_d, b_d, c_d
887 integer :: n
888 type(c_ptr), optional :: strm
889 type(c_ptr) :: strm_
890
891 if (n .lt. 1) return
892
893 if (present(strm)) then
894 strm_ = strm
895 else
896 strm_ = glb_cmd_queue
897 end if
898
899#if HAVE_HIP
900 call hip_sub3(a_d, b_d, c_d, n, strm_)
901#elif HAVE_CUDA
902 call cuda_sub3(a_d, b_d, c_d, n, strm_)
903#elif HAVE_OPENCL
904 call opencl_sub3(a_d, b_d, c_d, n, strm_)
905#else
906 call neko_error('No device backend configured')
907#endif
908 end subroutine device_sub3
909
911 subroutine device_addcol3(a_d, b_d, c_d, n, strm)
912 type(c_ptr) :: a_d, b_d, c_d
913 integer :: n
914 type(c_ptr), optional :: strm
915 type(c_ptr) :: strm_
916
917 if (n .lt. 1) return
918
919 if (present(strm)) then
920 strm_ = strm
921 else
922 strm_ = glb_cmd_queue
923 end if
924
925#if HAVE_HIP
926 call hip_addcol3(a_d, b_d, c_d, n, strm_)
927#elif HAVE_CUDA
928 call cuda_addcol3(a_d, b_d, c_d, n, strm_)
929#elif HAVE_OPENCL
930 call opencl_addcol3(a_d, b_d, c_d, n, strm_)
931#else
932 call neko_error('No device backend configured')
933#endif
934 end subroutine device_addcol3
935
937 subroutine device_addcol4(a_d, b_d, c_d, d_d, n, strm)
938 type(c_ptr) :: a_d, b_d, c_d, d_d
939 integer :: n
940 type(c_ptr), optional :: strm
941 type(c_ptr) :: strm_
942
943 if (n .lt. 1) return
944
945 if (present(strm)) then
946 strm_ = strm
947 else
948 strm_ = glb_cmd_queue
949 end if
950
951#if HAVE_HIP
952 call hip_addcol4(a_d, b_d, c_d, d_d, n, strm_)
953#elif HAVE_CUDA
954 call cuda_addcol4(a_d, b_d, c_d, d_d, n, strm_)
955#elif HAVE_OPENCL
956 call opencl_addcol4(a_d, b_d, c_d, d_d, n, strm_)
957#else
958 call neko_error('No device backend configured')
959#endif
960 end subroutine device_addcol4
961
963 subroutine device_addcol3s2(a_d, b_d, c_d, s, n, strm)
964 type(c_ptr) :: a_d, b_d, c_d
965 real(kind=rp) :: s
966 integer :: n
967 type(c_ptr), optional :: strm
968 type(c_ptr) :: strm_
969
970 if (n .lt. 1) return
971
972 if (present(strm)) then
973 strm_ = strm
974 else
975 strm_ = glb_cmd_queue
976 end if
977
978#if HAVE_HIP
979 call hip_addcol3s2(a_d, b_d, c_d, s, n, strm_)
980#elif HAVE_CUDA
981 call cuda_addcol3s2(a_d, b_d, c_d, s, n, strm_)
982#elif HAVE_OPENCL
983 call opencl_addcol3s2(a_d, b_d, c_d, s, n, strm_)
984#else
985 call neko_error('No device backend configured')
986#endif
987 end subroutine device_addcol3s2
988
991 subroutine device_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm)
992 type(c_ptr) :: dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d
993 integer :: n
994 type(c_ptr), optional :: strm
995 type(c_ptr) :: strm_
996
997 if (n .lt. 1) return
998
999 if (present(strm)) then
1000 strm_ = strm
1001 else
1002 strm_ = glb_cmd_queue
1003 end if
1004
1005#if HAVE_HIP
1006 call hip_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1007#elif HAVE_CUDA
1008 call cuda_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1009#elif HAVE_OPENCL
1010 call opencl_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1011#else
1012 call neko_error('No device backend configured')
1013#endif
1014 end subroutine device_vdot3
1015
1018 subroutine device_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1019 w1_d, w2_d, w3_d, n, strm)
1020 type(c_ptr) :: u1_d, u2_d, u3_d
1021 type(c_ptr) :: v1_d, v2_d, v3_d
1022 type(c_ptr) :: w1_d, w2_d, w3_d
1023 integer :: n
1024 type(c_ptr), optional :: strm
1025 type(c_ptr) :: strm_
1026
1027 if (n .lt. 1) return
1028
1029 if (present(strm)) then
1030 strm_ = strm
1031 else
1032 strm_ = glb_cmd_queue
1033 end if
1034
1035#if HAVE_HIP
1036 call hip_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1037 w1_d, w2_d, w3_d, n, strm_)
1038#elif HAVE_CUDA
1039 call cuda_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1040 w1_d, w2_d, w3_d, n, strm_)
1041#elif HAVE_OPENCL
1042 call opencl_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1043 w1_d, w2_d, w3_d, n, strm_)
1044#else
1045 call neko_error('No device backend configured')
1046#endif
1047 end subroutine device_vcross
1048
1049
1051 function device_vlsc3(u_d, v_d, w_d, n, strm) result(res)
1052 type(c_ptr) :: u_d, v_d, w_d
1053 integer :: n
1054 type(c_ptr), optional :: strm
1055 type(c_ptr) :: strm_
1056 real(kind=rp) :: res
1057
1058 res = 0.0_rp
1059
1060 if (n .lt. 1) return
1061
1062 if (present(strm)) then
1063 strm_ = strm
1064 else
1065 strm_ = glb_cmd_queue
1066 end if
1067
1068#if HAVE_HIP
1069 res = hip_vlsc3(u_d, v_d, w_d, n, strm_)
1070#elif HAVE_CUDA
1071 res = cuda_vlsc3(u_d, v_d, w_d, n, strm_)
1072#elif HAVE_OPENCL
1073 ! Same kernel as glsc3 (currently no device MPI for OpenCL)
1074 res = opencl_glsc3(u_d, v_d, w_d, n, strm_)
1075#else
1076 call neko_error('No device backend configured')
1077#endif
1078 end function device_vlsc3
1079
1081 function device_glsc3(a_d, b_d, c_d, n, strm) result(res)
1082 type(c_ptr) :: a_d, b_d, c_d
1083 integer :: n, ierr
1084 type(c_ptr), optional :: strm
1085 type(c_ptr) :: strm_
1086 real(kind=rp) :: res
1087
1088 if (present(strm)) then
1089 strm_ = strm
1090 else
1091 strm_ = glb_cmd_queue
1092 end if
1093
1094 res = 0.0_rp
1095#if HAVE_HIP
1096 res = hip_glsc3(a_d, b_d, c_d, n, strm_)
1097#elif HAVE_CUDA
1098 res = cuda_glsc3(a_d, b_d, c_d, n, strm_)
1099#elif HAVE_OPENCL
1100 res = opencl_glsc3(a_d, b_d, c_d, n, strm_)
1101#else
1102 call neko_error('No device backend configured')
1103#endif
1104
1105#ifndef HAVE_DEVICE_MPI
1106 if (pe_size .gt. 1) then
1107 call mpi_allreduce(mpi_in_place, res, 1, &
1108 mpi_real_precision, mpi_sum, neko_comm, ierr)
1109 end if
1110#endif
1111 end function device_glsc3
1112
1113 subroutine device_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm)
1114 type(c_ptr), value :: w_d, v_d_d, mult_d
1115 integer(c_int) :: j, n
1116 real(c_rp) :: h(j)
1117 type(c_ptr), optional :: strm
1118 type(c_ptr) :: strm_
1119 integer :: ierr
1120
1121 if (present(strm)) then
1122 strm_ = strm
1123 else
1124 strm_ = glb_cmd_queue
1125 end if
1126
1127#if HAVE_HIP
1128 call hip_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1129#elif HAVE_CUDA
1130 call cuda_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1131#elif HAVE_OPENCL
1132 call opencl_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1133#else
1134 call neko_error('No device backend configured')
1135#endif
1136
1137#ifndef HAVE_DEVICE_MPI
1138 if (pe_size .gt. 1) then
1139 call mpi_allreduce(mpi_in_place, h, j, &
1140 mpi_real_precision, mpi_sum, neko_comm, ierr)
1141 end if
1142#endif
1143 end subroutine device_glsc3_many
1144
1145 subroutine device_add2s2_many(y_d, x_d_d, a_d, j, n, strm)
1146 type(c_ptr), value :: y_d, x_d_d, a_d
1147 integer(c_int) :: j, n
1148 type(c_ptr), optional :: strm
1149 type(c_ptr) :: strm_
1150
1151 if (n .lt. 1) return
1152
1153 if (present(strm)) then
1154 strm_ = strm
1155 else
1156 strm_ = glb_cmd_queue
1157 end if
1158
1159#if HAVE_HIP
1160 call hip_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1161#elif HAVE_CUDA
1162 call cuda_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1163#elif HAVE_OPENCL
1164 call opencl_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1165#else
1166 call neko_error('No device backend configured')
1167#endif
1168 end subroutine device_add2s2_many
1169
1171 function device_glsc2(a_d, b_d, n, strm) result(res)
1172 type(c_ptr) :: a_d, b_d
1173 integer :: n, ierr
1174 real(kind=rp) :: res
1175 type(c_ptr), optional :: strm
1176 type(c_ptr) :: strm_
1177
1178 if (present(strm)) then
1179 strm_ = strm
1180 else
1181 strm_ = glb_cmd_queue
1182 end if
1183
1184 res = 0.0_rp
1185#if HAVE_HIP
1186 res = hip_glsc2(a_d, b_d, n, strm_)
1187#elif HAVE_CUDA
1188 res = cuda_glsc2(a_d, b_d, n, strm_)
1189#elif HAVE_OPENCL
1190 res = opencl_glsc2(a_d, b_d, n, strm_)
1191#else
1192 call neko_error('No device backend configured')
1193#endif
1194
1195#ifndef HAVE_DEVICE_MPI
1196 if (pe_size .gt. 1) then
1197 call mpi_allreduce(mpi_in_place, res, 1, &
1198 mpi_real_precision, mpi_sum, neko_comm, ierr)
1199 end if
1200#endif
1201 end function device_glsc2
1202
1205 function device_glsubnorm(a_d, b_d, n, strm) result(res)
1206 type(c_ptr), intent(in) :: a_d, b_d
1207 integer, intent(in) :: n
1208 integer :: ierr
1209 real(kind=rp) :: res
1210 type(c_ptr), optional :: strm
1211 type(c_ptr) :: strm_
1212
1213 if (present(strm)) then
1214 strm_ = strm
1215 else
1216 strm_ = glb_cmd_queue
1217 end if
1218
1219 res = 0.0_rp
1220#if HAVE_HIP
1221 res = hip_glsubnorm2(a_d, b_d, n, strm_)
1222#elif HAVE_CUDA
1223 res = cuda_glsubnorm2(a_d, b_d, n, strm_)
1224#elif HAVE_OPENCL
1225 res = opencl_glsubnorm2(a_d, b_d, n, strm_)
1226#else
1227 call neko_error('No device backend configured')
1228#endif
1229
1230#ifndef HAVE_DEVICE_MPI
1231 if (pe_size .gt. 1) then
1232 call mpi_allreduce(mpi_in_place, res, 1, &
1233 mpi_real_precision, mpi_sum, neko_comm, ierr)
1234 end if
1235#endif
1236
1237 res = sqrt(res)
1238 end function device_glsubnorm
1239
1241 function device_glsum(a_d, n, strm) result(res)
1242 type(c_ptr) :: a_d
1243 integer :: n, ierr
1244 real(kind=rp) :: res
1245 type(c_ptr), optional :: strm
1246 type(c_ptr) :: strm_
1247
1248 if (present(strm)) then
1249 strm_ = strm
1250 else
1251 strm_ = glb_cmd_queue
1252 end if
1253
1254 res = 0.0_rp
1255#if HAVE_HIP
1256 res = hip_glsum(a_d, n, strm_)
1257#elif HAVE_CUDA
1258 res = cuda_glsum(a_d, n, strm_)
1259#elif HAVE_OPENCL
1260 res = opencl_glsum(a_d, n, strm_)
1261#else
1262 call neko_error('No device backend configured')
1263#endif
1264
1265#ifndef HAVE_DEVICE_MPI
1266 if (pe_size .gt. 1) then
1267 call mpi_allreduce(mpi_in_place, res, 1, &
1268 mpi_real_precision, mpi_sum, neko_comm, ierr)
1269 end if
1270#endif
1271 end function device_glsum
1272
1273 subroutine device_absval(a_d, n, strm)
1274 integer, intent(in) :: n
1275 type(c_ptr) :: a_d
1276 type(c_ptr), optional :: strm
1277 type(c_ptr) :: strm_
1278
1279 if (n .lt. 1) return
1280
1281 if (present(strm)) then
1282 strm_ = strm
1283 else
1284 strm_ = glb_cmd_queue
1285 end if
1286
1287#ifdef HAVE_HIP
1288 call hip_absval(a_d, n, strm_)
1289#elif HAVE_CUDA
1290 call cuda_absval(a_d, n, strm_)
1291#elif HAVE_OPENCL
1292 call neko_error('OPENCL is not implemented for device_absval')
1293#else
1294 call neko_error('No device backend configured')
1295#endif
1296
1297 end subroutine device_absval
1298
1299 ! ========================================================================== !
1300 ! Device point-wise max
1301
1304 subroutine device_pwmax2(a_d, b_d, n, strm)
1305 type(c_ptr) :: a_d, b_d
1306 integer :: n
1307 type(c_ptr), optional :: strm
1308 type(c_ptr) :: strm_
1309
1310 if (n .lt. 1) return
1311
1312 if (present(strm)) then
1313 strm_ = strm
1314 else
1315 strm_ = glb_cmd_queue
1316 end if
1317
1318#if HAVE_HIP
1319 call hip_pwmax_vec2(a_d, b_d, n, strm_)
1320#elif HAVE_CUDA
1321 call cuda_pwmax_vec2(a_d, b_d, n, strm_)
1322#elif HAVE_OPENCL
1323 call neko_error('No OpenCL backend for device_pwmax2')
1324#else
1325 call neko_error('No device backend configured')
1326#endif
1327 end subroutine device_pwmax2
1328
1331 subroutine device_pwmax3(a_d, b_d, c_d, n, strm)
1332 type(c_ptr) :: a_d, b_d, c_d
1333 integer :: n
1334 type(c_ptr), optional :: strm
1335 type(c_ptr) :: strm_
1336
1337 if (n .lt. 1) return
1338
1339 if (present(strm)) then
1340 strm_ = strm
1341 else
1342 strm_ = glb_cmd_queue
1343 end if
1344
1345#if HAVE_HIP
1346 call hip_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1347#elif HAVE_CUDA
1348 call cuda_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1349#elif HAVE_OPENCL
1350 call neko_error('No OpenCL backend for device_pwmax3')
1351#else
1352 call neko_error('No device backend configured')
1353#endif
1354
1355 end subroutine device_pwmax3
1356
1359 subroutine device_cpwmax2(a_d, c, n, strm)
1360 type(c_ptr) :: a_d
1361 real(kind=rp), intent(in) :: c
1362 integer :: n
1363 type(c_ptr), optional :: strm
1364 type(c_ptr) :: strm_
1365
1366 if (n .lt. 1) return
1367
1368 if (present(strm)) then
1369 strm_ = strm
1370 else
1371 strm_ = glb_cmd_queue
1372 end if
1373
1374#if HAVE_HIP
1375 call hip_pwmax_sca2(a_d, c, n, strm_)
1376#elif HAVE_CUDA
1377 call cuda_pwmax_sca2(a_d, c, n, strm_)
1378#elif HAVE_OPENCL
1379 call neko_error('No OpenCL backend for device_cpwmax2')
1380#else
1381 call neko_error('No device backend configured')
1382#endif
1383
1384 end subroutine device_cpwmax2
1385
1388 subroutine device_cpwmax3(a_d, b_d, c, n, strm)
1389 type(c_ptr) :: a_d, b_d
1390 real(kind=rp), intent(in) :: c
1391 integer :: n
1392 type(c_ptr), optional :: strm
1393 type(c_ptr) :: strm_
1394
1395 if (n .lt. 1) return
1396
1397 if (present(strm)) then
1398 strm_ = strm
1399 else
1400 strm_ = glb_cmd_queue
1401 end if
1402
1403#if HAVE_HIP
1404 call hip_pwmax_sca3(a_d, b_d, c, n, strm_)
1405#elif HAVE_CUDA
1406 call cuda_pwmax_sca3(a_d, b_d, c, n, strm_)
1407#elif HAVE_OPENCL
1408 call neko_error('No OpenCL backend for device_cpwmax3')
1409#else
1410 call neko_error('No device backend configured')
1411#endif
1412
1413 end subroutine device_cpwmax3
1414
1415 ! ========================================================================== !
1416 ! Device point-wise min
1417
1420 subroutine device_pwmin2(a_d, b_d, n, strm)
1421 type(c_ptr) :: a_d, b_d
1422 integer :: n
1423 type(c_ptr), optional :: strm
1424 type(c_ptr) :: strm_
1425
1426 if (n .lt. 1) return
1427
1428 if (present(strm)) then
1429 strm_ = strm
1430 else
1431 strm_ = glb_cmd_queue
1432 end if
1433
1434#if HAVE_HIP
1435 call hip_pwmin_vec2(a_d, b_d, n, strm_)
1436#elif HAVE_CUDA
1437 call cuda_pwmin_vec2(a_d, b_d, n, strm_)
1438#elif HAVE_OPENCL
1439 call neko_error('No OpenCL backend for device_pwmin2')
1440#else
1441 call neko_error('No device backend configured')
1442#endif
1443 end subroutine device_pwmin2
1444
1447 subroutine device_pwmin3(a_d, b_d, c_d, n, strm)
1448 type(c_ptr) :: a_d, b_d, c_d
1449 integer :: n
1450 type(c_ptr), optional :: strm
1451 type(c_ptr) :: strm_
1452
1453 if (n .lt. 1) return
1454
1455 if (present(strm)) then
1456 strm_ = strm
1457 else
1458 strm_ = glb_cmd_queue
1459 end if
1460
1461#if HAVE_HIP
1462 call hip_pwmin_vec3(a_d, b_d, c_d, n, strm_)
1463#elif HAVE_CUDA
1464 call cuda_pwmin_vec3(a_d, b_d, c_d, n, strm_)
1465#elif HAVE_OPENCL
1466 call neko_error('No OpenCL backend for device_pwmin3')
1467#else
1468 call neko_error('No device backend configured')
1469#endif
1470
1471 end subroutine device_pwmin3
1472
1475 subroutine device_cpwmin2(a_d, c, n, strm)
1476 type(c_ptr) :: a_d
1477 real(kind=rp), intent(in) :: c
1478 integer :: n
1479 type(c_ptr), optional :: strm
1480 type(c_ptr) :: strm_
1481
1482 if (n .lt. 1) return
1483
1484 if (present(strm)) then
1485 strm_ = strm
1486 else
1487 strm_ = glb_cmd_queue
1488 end if
1489
1490#if HAVE_HIP
1491 call hip_pwmin_sca2(a_d, c, n, strm_)
1492#elif HAVE_CUDA
1493 call cuda_pwmin_sca2(a_d, c, n, strm_)
1494#elif HAVE_OPENCL
1495 call neko_error('No OpenCL backend for device_cpwmin2')
1496#else
1497 call neko_error('No device backend configured')
1498#endif
1499
1500 end subroutine device_cpwmin2
1501
1504 subroutine device_cpwmin3(a_d, b_d, c, n, strm)
1505 type(c_ptr) :: a_d, b_d
1506 real(kind=rp), intent(in) :: c
1507 integer :: n
1508 type(c_ptr), optional :: strm
1509 type(c_ptr) :: strm_
1510
1511 if (n .lt. 1) return
1512
1513 if (present(strm)) then
1514 strm_ = strm
1515 else
1516 strm_ = glb_cmd_queue
1517 end if
1518
1519#if HAVE_HIP
1520 call hip_pwmin_sca3(a_d, b_d, c, n, strm_)
1521#elif HAVE_CUDA
1522 call cuda_pwmin_sca3(a_d, b_d, c, n, strm_)
1523#elif HAVE_OPENCL
1524 call neko_error('No OpenCL backend for device_cpwmin3')
1525#else
1526 call neko_error('No device backend configured')
1527#endif
1528
1529 end subroutine device_cpwmin3
1530
1531 ! ========================================================================== !
1532 ! Integer operations
1533
1535 subroutine device_iadd(a_d, c, n, strm)
1536 type(c_ptr), intent(inout) :: a_d
1537 integer, intent(in) :: c
1538 integer, intent(in) :: n
1539 type(c_ptr), optional :: strm
1540 type(c_ptr) :: strm_
1541 if (n .lt. 1) return
1542
1543 if (present(strm)) then
1544 strm_ = strm
1545 else
1546 strm_ = glb_cmd_queue
1547 end if
1548
1549#if HAVE_HIP
1550 call hip_iadd(a_d, c, n, strm_)
1551#elif HAVE_CUDA
1552 call cuda_iadd(a_d, c, n, strm_)
1553#elif HAVE_OPENCL
1554 call opencl_iadd(a_d, c, n, strm_)
1555#else
1556 call neko_error('No device backend configured')
1557#endif
1558 end subroutine device_iadd
1559
1560
1561end module device_math
Definition comm.F90:1
type(mpi_datatype), public mpi_real_precision
MPI type for working precision of REAL types.
Definition comm.F90:50
integer, public pe_size
MPI size of communicator.
Definition comm.F90:58
type(mpi_comm), public neko_comm
MPI communicator.
Definition comm.F90:42
subroutine, public device_pwmin2(a_d, b_d, n, strm)
Compute the point-wise minimum of two vectors .
subroutine, public device_add2s1(a_d, b_d, c1, n, strm)
subroutine, public device_add2s2_many(y_d, x_d_d, a_d, j, n, strm)
subroutine, public device_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm)
Returns .
subroutine, public device_sub3(a_d, b_d, c_d, n, strm)
Vector subtraction .
subroutine, public device_masked_scatter_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Scatter a masked vector .
subroutine, public device_add2s2(a_d, b_d, c1, n, strm)
Vector addition with scalar multiplication (multiplication on first argument)
subroutine, public device_add2(a_d, b_d, n, strm)
Vector addition .
subroutine, public device_addcol3(a_d, b_d, c_d, n, strm)
Returns .
real(kind=rp) function, public device_glsum(a_d, n, strm)
Sum a vector of length n.
subroutine, public device_pwmax3(a_d, b_d, c_d, n, strm)
Compute the point-wise maximum of two vectors .
subroutine, public device_invcol1(a_d, n, strm)
Invert a vector .
subroutine, public device_add3s2(a_d, b_d, c_d, c1, c2, n, strm)
Returns .
subroutine, public device_masked_atomic_reduction_0(a_d, b_d, mask_d, n, n_mask, strm)
subroutine, public device_cpwmax3(a_d, b_d, c, n, strm)
Compute the point-wise maximum of a vector and a scalar .
subroutine, public device_rzero(a_d, n, strm)
Zero a real vector.
subroutine, public device_rone(a_d, n, strm)
Set all elements to one.
subroutine, public device_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, w1_d, w2_d, w3_d, n, strm)
Compute a cross product (3-d version) assuming vector components etc.
subroutine, public device_cmult(a_d, c, n, strm)
Multiplication by constant c .
subroutine, public device_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm)
Compute a dot product (3-d version) assuming vector components etc.
real(kind=rp) function, public device_glsubnorm(a_d, b_d, n, strm)
Returns the norm of the difference of two vectors .
subroutine, public device_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm)
subroutine, public device_cfill_mask(a_d, c, n, mask_d, n_mask, strm)
Fill a constant to a masked vector. .
subroutine, public device_cadd2(a_d, b_d, c, n, strm)
Add a scalar to vector .
subroutine, public device_sub2(a_d, b_d, n, strm)
Vector substraction .
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
subroutine, public device_invcol3(a_d, b_d, c_d, n, strm)
Vector division .
subroutine, public device_pwmin3(a_d, b_d, c_d, n, strm)
Compute the point-wise minimum of two vectors .
subroutine, public device_col2(a_d, b_d, n, strm)
Vector multiplication .
real(kind=rp) function, public device_vlsc3(u_d, v_d, w_d, n, strm)
Compute multiplication sum .
subroutine, public device_cdiv2(a_d, b_d, c, n, strm)
Division of constant c by array .
subroutine, public device_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm)
Returns .
subroutine, public device_add4(a_d, b_d, c_d, d_d, n, strm)
subroutine, public device_cpwmax2(a_d, c, n, strm)
Compute the point-wise maximum of a vector and a scalar .
subroutine, public device_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Copy a masked vector .
subroutine, public device_subcol3(a_d, b_d, c_d, n, strm)
Returns .
subroutine, public device_cdiv(a_d, c, n, strm)
Division of constant c by array .
subroutine, public device_absval(a_d, n, strm)
subroutine device_iadd(a_d, c, n, strm)
Add an integer scalar to vector .
subroutine, public device_masked_gather_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Gather a masked vector .
subroutine, public device_invcol2(a_d, b_d, n, strm)
Vector division .
subroutine, public device_addsqr2s2(a_d, b_d, c1, n, strm)
Returns .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
subroutine, public device_cpwmin3(a_d, b_d, c, n, strm)
Compute the point-wise minimum of a vector and a scalar .
real(kind=rp) function, public device_glsc2(a_d, b_d, n, strm)
Weighted inner product .
subroutine, public device_cmult2(a_d, b_d, c, n, strm)
Multiplication by constant c .
subroutine, public device_col3(a_d, b_d, c_d, n, strm)
Vector multiplication with 3 vectors .
subroutine, public device_addcol4(a_d, b_d, c_d, d_d, n, strm)
Returns .
subroutine, public device_cfill(a_d, c, n, strm)
Set all elements to a constant c .
subroutine, public device_cpwmin2(a_d, c, n, strm)
Compute the point-wise minimum of a vector and a scalar .
subroutine, public device_add3(a_d, b_d, c_d, n, strm)
Vector addition .
subroutine, public device_addcol3s2(a_d, b_d, c_d, s, n, strm)
Returns .
subroutine, public device_pwmax2(a_d, b_d, n, strm)
Compute the point-wise maximum of two vectors .
subroutine device_radd(a_d, c, n, strm)
Add a scalar to vector .
Device abstraction, common interface for various accelerators.
Definition device.F90:34
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
Definition device.F90:51
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Utilities.
Definition utils.f90:35