Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
device_math.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2025, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
34 use, intrinsic :: iso_c_binding, only: c_ptr, c_int
35 use num_types, only : rp, c_rp
36 use utils, only : neko_error
38 use mpi_f08, only : mpi_sum, mpi_in_place, mpi_allreduce
39 use device, only : glb_cmd_queue
40 ! ========================================================================== !
41 ! Device math interfaces
42
43 use hip_math
44 use cuda_math
45 use opencl_math
46
47 implicit none
48 private
49
50 interface device_cadd
51 module procedure device_radd, device_iadd
52 end interface device_cadd
53
69
70contains
71
73 subroutine device_copy(a_d, b_d, n, strm)
74 type(c_ptr) :: a_d, b_d
75 integer :: n
76 type(c_ptr), optional :: strm
77 type(c_ptr) :: strm_
78
79 if (n .lt. 1) return
80
81 if (present(strm)) then
82 strm_ = strm
83 else
84 strm_ = glb_cmd_queue
85 end if
86
87#if HAVE_HIP
88 call hip_copy(a_d, b_d, n, strm_)
89#elif HAVE_CUDA
90 call cuda_copy(a_d, b_d, n, strm_)
91#elif HAVE_OPENCL
92 call opencl_copy(a_d, b_d, n, strm_)
93#else
94 call neko_error('no device backend configured')
95#endif
96 end subroutine device_copy
97
99 subroutine device_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
100 type(c_ptr) :: a_d, b_d, mask_d
101 integer :: n, n_mask
102 type(c_ptr), optional :: strm
103 type(c_ptr) :: strm_
104
105 if (n .lt. 1 .or. n_mask .lt. 1) return
106
107 if (present(strm)) then
108 strm_ = strm
109 else
110 strm_ = glb_cmd_queue
111 end if
112
113#if HAVE_HIP
114 call hip_masked_copy(a_d, b_d, mask_d, n, n_mask, strm_)
115#elif HAVE_CUDA
116 call cuda_masked_copy(a_d, b_d, mask_d, n, n_mask, strm_)
117#elif HAVE_OPENCL
118 call opencl_masked_copy(a_d, b_d, mask_d, n, n_mask, strm_)
119#else
120 call neko_error('no device backend configured')
121#endif
122 end subroutine device_masked_copy_0
123
125 subroutine device_masked_gather_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
126 type(c_ptr) :: a_d, b_d, mask_d
127 integer :: n, n_mask
128 type(c_ptr), optional :: strm
129 type(c_ptr) :: strm_
130
131 if (n .lt. 1 .or. n_mask .lt. 1) return
132
133 if (present(strm)) then
134 strm_ = strm
135 else
136 strm_ = glb_cmd_queue
137 end if
138
139#if HAVE_HIP
140 call hip_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
141#elif HAVE_CUDA
142 call cuda_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
143#elif HAVE_OPENCL
144 call opencl_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
145#else
146 call neko_error('no device backend configured')
147#endif
148 end subroutine device_masked_gather_copy_0
149
151 ! In this case, the mask comes from a mask_t type
152 subroutine device_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
153 type(c_ptr) :: a_d, b_d, mask_d
154 integer :: n, n_mask
155 type(c_ptr), optional :: strm
156 type(c_ptr) :: strm_
157
158 if (n .lt. 1 .or. n_mask .lt. 1) return
159
160 if (present(strm)) then
161 strm_ = strm
162 else
163 strm_ = glb_cmd_queue
164 end if
165
166#if HAVE_HIP
167 call hip_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
168#elif HAVE_CUDA
169 call cuda_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
170#elif HAVE_OPENCL
171 call opencl_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
172#else
173 call neko_error('no device backend configured')
174#endif
176
178 subroutine device_masked_scatter_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
179 type(c_ptr) :: a_d, b_d, mask_d
180 integer :: n, n_mask
181 type(c_ptr), optional :: strm
182 type(c_ptr) :: strm_
183
184 if (n .lt. 1 .or. n_mask .lt. 1) return
185
186 if (present(strm)) then
187 strm_ = strm
188 else
189 strm_ = glb_cmd_queue
190 end if
191
192#if HAVE_HIP
193 call hip_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
194#elif HAVE_CUDA
195 call cuda_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
196#elif HAVE_OPENCL
197 call opencl_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
198#else
199 call neko_error('no device backend configured')
200#endif
201 end subroutine device_masked_scatter_copy_0
202
203 subroutine device_masked_atomic_reduction_0(a_d, b_d, mask_d, n, n_mask, strm)
204 type(c_ptr) :: a_d, b_d, mask_d
205 integer :: n, n_mask
206 type(c_ptr), optional :: strm
207 type(c_ptr) :: strm_
208
209 if (n .lt. 1 .or. n_mask .lt. 1) return
210
211 if (present(strm)) then
212 strm_ = strm
213 else
214 strm_ = glb_cmd_queue
215 end if
216
217#if HAVE_HIP
218 call hip_masked_atomic_reduction(a_d, b_d, mask_d, n, n_mask, strm_)
219#elif HAVE_CUDA
220 call cuda_masked_atomic_reduction(a_d, b_d, mask_d, n, n_mask, strm_)
221#elif HAVE_OPENCL
222 call neko_error('No OpenCL bcknd, masked atomic reduction')
223#else
224 call neko_error('no device backend configured')
225#endif
227
230 subroutine device_cfill_mask(a_d, c, n, mask_d, n_mask, strm)
231 type(c_ptr) :: a_d
232 real(kind=rp), intent(in) :: c
233 integer :: n
234 type(c_ptr) :: mask_d
235 integer :: n_mask
236 type(c_ptr), optional :: strm
237 type(c_ptr) :: strm_
238
239 if (n .lt. 1 .or. n_mask .lt. 1) return
240
241 if (present(strm)) then
242 strm_ = strm
243 else
244 strm_ = glb_cmd_queue
245 end if
246
247#if HAVE_HIP
248 call hip_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
249#elif HAVE_CUDA
250 call cuda_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
251#elif HAVE_OPENCL
252 call opencl_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
253#else
254 call neko_error('No device backend configured')
255#endif
256 end subroutine device_cfill_mask
257
259 subroutine device_rzero(a_d, n, strm)
260 type(c_ptr) :: a_d
261 integer :: n
262 type(c_ptr), optional :: strm
263 type(c_ptr) :: strm_
264
265 if (n .lt. 1) return
266
267 if (present(strm)) then
268 strm_ = strm
269 else
270 strm_ = glb_cmd_queue
271 end if
272
273#if HAVE_HIP
274 call hip_rzero(a_d, n, strm_)
275#elif HAVE_CUDA
276 call cuda_rzero(a_d, n, strm_)
277#elif HAVE_OPENCL
278 call opencl_rzero(a_d, n, strm_)
279#else
280 call neko_error('No device backend configured')
281#endif
282 end subroutine device_rzero
283
285 subroutine device_rone(a_d, n, strm)
286 type(c_ptr) :: a_d
287 integer :: n
288 type(c_ptr), optional :: strm
289 type(c_ptr) :: strm_
290 real(kind=rp), parameter :: one = 1.0_rp
291
292 if (n .lt. 1) return
293
294 if (present(strm)) then
295 strm_ = strm
296 else
297 strm_ = glb_cmd_queue
298 end if
299
300#if HAVE_HIP || HAVE_CUDA || HAVE_OPENCL
301 call device_cfill(a_d, one, n, strm_)
302#else
303 call neko_error('No device backend configured')
304#endif
305 end subroutine device_rone
306
308 subroutine device_cmult(a_d, c, n, strm)
309 type(c_ptr) :: a_d
310 real(kind=rp), intent(in) :: c
311 integer :: n
312 type(c_ptr), optional :: strm
313 type(c_ptr) :: strm_
314
315 if (n .lt. 1) return
316
317 if (present(strm)) then
318 strm_ = strm
319 else
320 strm_ = glb_cmd_queue
321 end if
322
323#if HAVE_HIP
324 call hip_cmult(a_d, c, n, strm_)
325#elif HAVE_CUDA
326 call cuda_cmult(a_d, c, n, strm_)
327#elif HAVE_OPENCL
328 call opencl_cmult(a_d, c, n, strm_)
329#else
330 call neko_error('No device backend configured')
331#endif
332 end subroutine device_cmult
333
335 subroutine device_cmult2(a_d, b_d, c, n, strm)
336 type(c_ptr) :: a_d, b_d
337 real(kind=rp), intent(in) :: c
338 integer :: n
339 type(c_ptr), optional :: strm
340 type(c_ptr) :: strm_
341
342 if (n .lt. 1) return
343
344 if (present(strm)) then
345 strm_ = strm
346 else
347 strm_ = glb_cmd_queue
348 end if
349
350#if HAVE_HIP
351 call hip_cmult2(a_d, b_d, c, n, strm_)
352#elif HAVE_CUDA
353 call cuda_cmult2(a_d, b_d, c, n, strm_)
354#elif HAVE_OPENCL
355 call opencl_cmult2(a_d, b_d, c, n, strm_)
356#else
357 call neko_error('No device backend configured')
358#endif
359 end subroutine device_cmult2
360
362 subroutine device_cdiv(a_d, c, n, strm)
363 type(c_ptr) :: a_d
364 real(kind=rp), intent(in) :: c
365 integer :: n
366 type(c_ptr), optional :: strm
367 type(c_ptr) :: strm_
368
369 if (present(strm)) then
370 strm_ = strm
371 else
372 strm_ = glb_cmd_queue
373 end if
374
375#if HAVE_HIP
376 call hip_cdiv(a_d, c, n, strm_)
377#elif HAVE_CUDA
378 call cuda_cdiv(a_d, c, n, strm_)
379#elif HAVE_OPENCL
380 call opencl_cdiv(a_d, c, n, strm_)
381#else
382 call neko_error('No device backend configured')
383#endif
384 end subroutine device_cdiv
385
387 subroutine device_cdiv2(a_d, b_d, c, n, strm)
388 type(c_ptr) :: a_d, b_d
389 real(kind=rp), intent(in) :: c
390 integer :: n
391 type(c_ptr), optional :: strm
392 type(c_ptr) :: strm_
393
394 if (present(strm)) then
395 strm_ = strm
396 else
397 strm_ = glb_cmd_queue
398 end if
399
400#if HAVE_HIP
401 call hip_cdiv2(a_d, b_d, c, n, strm_)
402#elif HAVE_CUDA
403 call cuda_cdiv2(a_d, b_d, c, n, strm_)
404#elif HAVE_OPENCL
405 call opencl_cdiv2(a_d, b_d, c, n, strm_)
406#else
407 call neko_error('No device backend configured')
408#endif
409 end subroutine device_cdiv2
410
412 subroutine device_radd(a_d, c, n, strm)
413 type(c_ptr) :: a_d
414 real(kind=rp), intent(in) :: c
415 integer :: n
416 type(c_ptr), optional :: strm
417 type(c_ptr) :: strm_
418
419 if (n .lt. 1) return
420
421 if (present(strm)) then
422 strm_ = strm
423 else
424 strm_ = glb_cmd_queue
425 end if
426
427#if HAVE_HIP
428 call hip_radd(a_d, c, n, strm_)
429#elif HAVE_CUDA
430 call cuda_radd(a_d, c, n, strm_)
431#elif HAVE_OPENCL
432 call opencl_radd(a_d, c, n, strm_)
433#else
434 call neko_error('No device backend configured')
435#endif
436 end subroutine device_radd
437
439 subroutine device_cadd2(a_d, b_d, c, n, strm)
440 type(c_ptr) :: a_d
441 type(c_ptr) :: b_d
442 real(kind=rp), intent(in) :: c
443 integer :: n
444 type(c_ptr), optional :: strm
445 type(c_ptr) :: strm_
446
447 if (n .lt. 1) return
448
449 if (present(strm)) then
450 strm_ = strm
451 else
452 strm_ = glb_cmd_queue
453 end if
454
455#if HAVE_HIP
456 call hip_cadd2(a_d, b_d, c, n, strm_)
457#elif HAVE_CUDA
458 call cuda_cadd2(a_d, b_d, c, n, strm_)
459#elif HAVE_OPENCL
460 call opencl_cadd2(a_d, b_d, c, n, strm_)
461#else
462 call neko_error('No device backend configured')
463#endif
464 end subroutine device_cadd2
465
467 subroutine device_cfill(a_d, c, n, strm)
468 type(c_ptr) :: a_d
469 real(kind=rp), intent(in) :: c
470 integer :: n
471 type(c_ptr), optional ::strm
472 type(c_ptr) :: strm_
473
474 if (n .lt. 1) return
475
476 if (present(strm)) then
477 strm_ = strm
478 else
479 strm_ = glb_cmd_queue
480 end if
481
482#if HAVE_HIP
483 call hip_cfill(a_d, c, n, strm_)
484#elif HAVE_CUDA
485 call cuda_cfill(a_d, c, n, strm_)
486#elif HAVE_OPENCL
487 call opencl_cfill(a_d, c, n, strm_)
488#else
489 call neko_error('No device backend configured')
490#endif
491 end subroutine device_cfill
492
494 subroutine device_add2(a_d, b_d, n, strm)
495 type(c_ptr) :: a_d, b_d
496 integer :: n
497 type(c_ptr), optional :: strm
498 type(c_ptr) :: strm_
499
500 if (n .lt. 1) return
501
502 if (present(strm)) then
503 strm_ = strm
504 else
505 strm_ = glb_cmd_queue
506 end if
507
508#if HAVE_HIP
509 call hip_add2(a_d, b_d, n, strm_)
510#elif HAVE_CUDA
511 call cuda_add2(a_d, b_d, n, strm_)
512#elif HAVE_OPENCL
513 call opencl_add2(a_d, b_d, n, strm_)
514#else
515 call neko_error('No device backend configured')
516#endif
517 end subroutine device_add2
518
519 subroutine device_add4(a_d, b_d, c_d, d_d, n, strm)
520 type(c_ptr) :: a_d, b_d, c_d, d_d
521 integer :: n
522 type(c_ptr), optional :: strm
523 type(c_ptr) :: strm_
524
525 if (n .lt. 1) return
526
527 if (present(strm)) then
528 strm_ = strm
529 else
530 strm_ = glb_cmd_queue
531 end if
532
533#if HAVE_HIP
534 call hip_add4(a_d, b_d, c_d, d_d, n, strm_)
535#elif HAVE_CUDA
536 call cuda_add4(a_d, b_d, c_d, d_d, n, strm_)
537#elif HAVE_OPENCL
538 call opencl_add4(a_d, b_d, c_d, d_d, n, strm_)
539#else
540 call neko_error('No device backend configured')
541#endif
542 end subroutine device_add4
543
544 subroutine device_add2s1(a_d, b_d, c1, n, strm)
545 type(c_ptr) :: a_d, b_d
546 real(kind=rp) :: c1
547 integer :: n
548 type(c_ptr), optional :: strm
549 type(c_ptr) :: strm_
550
551 if (n .lt. 1) return
552
553 if (present(strm)) then
554 strm_ = strm
555 else
556 strm_ = glb_cmd_queue
557 end if
558
559#if HAVE_HIP
560 call hip_add2s1(a_d, b_d, c1, n, strm_)
561#elif HAVE_CUDA
562 call cuda_add2s1(a_d, b_d, c1, n, strm_)
563#elif HAVE_OPENCL
564 call opencl_add2s1(a_d, b_d, c1, n, strm_)
565#else
566 call neko_error('No device backend configured')
567#endif
568 end subroutine device_add2s1
569
572 subroutine device_add2s2(a_d, b_d, c1, n, strm)
573 type(c_ptr) :: a_d, b_d
574 real(kind=rp) :: c1
575 integer :: n
576 type(c_ptr), optional :: strm
577 type(c_ptr) :: strm_
578
579 if (n .lt. 1) return
580
581 if (present(strm)) then
582 strm_ = strm
583 else
584 strm_ = glb_cmd_queue
585 end if
586
587#if HAVE_HIP
588 call hip_add2s2(a_d, b_d, c1, n, strm_)
589#elif HAVE_CUDA
590 call cuda_add2s2(a_d, b_d, c1, n, strm_)
591#elif HAVE_OPENCL
592 call opencl_add2s2(a_d, b_d, c1, n, strm_)
593#else
594 call neko_error('No device backend configured')
595#endif
596 end subroutine device_add2s2
597
599 subroutine device_addsqr2s2(a_d, b_d, c1, n, strm)
600 type(c_ptr) :: a_d, b_d
601 real(kind=rp) :: c1
602 integer :: n
603 type(c_ptr), optional :: strm
604 type(c_ptr) :: strm_
605
606 if (n .lt. 1) return
607
608 if (present(strm)) then
609 strm_ = strm
610 else
611 strm_ = glb_cmd_queue
612 end if
613
614#if HAVE_HIP
615 call hip_addsqr2s2(a_d, b_d, c1, n, strm_)
616#elif HAVE_CUDA
617 call cuda_addsqr2s2(a_d, b_d, c1, n, strm_)
618#elif HAVE_OPENCL
619 call opencl_addsqr2s2(a_d, b_d, c1, n, strm_)
620#else
621 call neko_error('No device backend configured')
622#endif
623 end subroutine device_addsqr2s2
624
626 subroutine device_add3(a_d, b_d, c_d, n, strm)
627 type(c_ptr) :: a_d, b_d, c_d
628 integer :: n
629 type(c_ptr), optional :: strm
630 type(c_ptr) :: strm_
631
632 if (n .lt. 1) return
633
634 if (present(strm)) then
635 strm_ = strm
636 else
637 strm_ = glb_cmd_queue
638 end if
639
640#if HAVE_HIP
641 call hip_add3(a_d, b_d, c_d, n, strm_)
642#elif HAVE_CUDA
643 call cuda_add3(a_d, b_d, c_d, n, strm_)
644#elif HAVE_OPENCL
645 call opencl_add3(a_d, b_d, c_d, n, strm_)
646#else
647 call neko_error('No device backend configured')
648#endif
649 end subroutine device_add3
650
652 subroutine device_add3s2(a_d, b_d, c_d, c1, c2 , n, strm)
653 type(c_ptr) :: a_d, b_d, c_d
654 real(kind=rp) :: c1, c2
655 integer :: n
656 type(c_ptr), optional :: strm
657 type(c_ptr) :: strm_
658
659 if (n .lt. 1) return
660
661 if (present(strm)) then
662 strm_ = strm
663 else
664 strm_ = glb_cmd_queue
665 end if
666
667#if HAVE_HIP
668 call hip_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
669#elif HAVE_CUDA
670 call cuda_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
671#elif HAVE_OPENCL
672 call opencl_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
673#else
674 call neko_error('No device backend configured')
675#endif
676 end subroutine device_add3s2
677
679 subroutine device_add4s3(a_d, b_d, c_d, d_d, c1, c2 , c3, n, strm)
680 type(c_ptr) :: a_d, b_d, c_d, d_d
681 real(kind=rp) :: c1, c2, c3
682 integer :: n
683 type(c_ptr), optional :: strm
684 type(c_ptr) :: strm_
685
686 if (n .lt. 1) return
687
688 if (present(strm)) then
689 strm_ = strm
690 else
691 strm_ = glb_cmd_queue
692 end if
693
694#if HAVE_HIP
695 call hip_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
696#elif HAVE_CUDA
697 call cuda_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
698#elif HAVE_OPENCL
699 call opencl_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
700#else
701 call neko_error('No device backend configured')
702#endif
703 end subroutine device_add4s3
704
706 subroutine device_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2 , c3, c4, n, strm)
707 type(c_ptr) :: a_d, b_d, c_d, d_d, e_d
708 real(kind=rp) :: c1, c2, c3, c4
709 integer :: n
710 type(c_ptr), optional :: strm
711 type(c_ptr) :: strm_
712
713 if (n .lt. 1) return
714
715 if (present(strm)) then
716 strm_ = strm
717 else
718 strm_ = glb_cmd_queue
719 end if
720
721#if HAVE_HIP
722 call hip_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
723#elif HAVE_CUDA
724 call cuda_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
725#elif HAVE_OPENCL
726 call opencl_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
727#else
728 call neko_error('No device backend configured')
729#endif
730 end subroutine device_add5s4
731
733 subroutine device_invcol1(a_d, n, strm)
734 type(c_ptr) :: a_d
735 integer :: n
736 type(c_ptr), optional :: strm
737 type(c_ptr) :: strm_
738
739 if (n .lt. 1) return
740
741 if (present(strm)) then
742 strm_ = strm
743 else
744 strm_ = glb_cmd_queue
745 end if
746
747#if HAVE_HIP
748 call hip_invcol1(a_d, n, strm_)
749#elif HAVE_CUDA
750 call cuda_invcol1(a_d, n, strm_)
751#elif HAVE_OPENCL
752 call opencl_invcol1(a_d, n, strm_)
753#else
754 call neko_error('No device backend configured')
755#endif
756 end subroutine device_invcol1
757
759 subroutine device_invcol2(a_d, b_d, n, strm)
760 type(c_ptr) :: a_d, b_d
761 integer :: n
762 type(c_ptr), optional :: strm
763 type(c_ptr) :: strm_
764
765 if (n .lt. 1) return
766
767 if (present(strm)) then
768 strm_ = strm
769 else
770 strm_ = glb_cmd_queue
771 end if
772
773#if HAVE_HIP
774 call hip_invcol2(a_d, b_d, n, strm_)
775#elif HAVE_CUDA
776 call cuda_invcol2(a_d, b_d, n, strm_)
777#elif HAVE_OPENCL
778 call opencl_invcol2(a_d, b_d, n, strm_)
779#else
780 call neko_error('No device backend configured')
781#endif
782 end subroutine device_invcol2
783
785 subroutine device_invcol3(a_d, b_d, c_d, n, strm)
786 type(c_ptr) :: a_d, b_d, c_d
787 integer :: n
788 type(c_ptr), optional :: strm
789 type(c_ptr) :: strm_
790
791 if (present(strm)) then
792 strm_ = strm
793 else
794 strm_ = glb_cmd_queue
795 end if
796
797#ifdef HAVE_HIP
798 call hip_invcol3(a_d, b_d, c_d, n, strm_)
799#elif HAVE_CUDA
800 call cuda_invcol3(a_d, b_d, c_d, n, strm_)
801#elif HAVE_OPENCL
802 ! call opencl_invcol3(a_d, b_d, c_d, n)
803 call neko_error('opencl_invcol3 not implemented')
804#else
805 call neko_error('No device backend configured')
806#endif
807 end subroutine device_invcol3
808
810 subroutine device_col2(a_d, b_d, n, strm)
811 type(c_ptr) :: a_d, b_d
812 integer :: n
813 type(c_ptr), optional :: strm
814 type(c_ptr) :: strm_
815
816 if (present(strm)) then
817 strm_ = strm
818 else
819 strm_ = glb_cmd_queue
820 end if
821
822 if (n .lt. 1) return
823#if HAVE_HIP
824 call hip_col2(a_d, b_d, n, strm_)
825#elif HAVE_CUDA
826 call cuda_col2(a_d, b_d, n, strm_)
827#elif HAVE_OPENCL
828 call opencl_col2(a_d, b_d, n, strm_)
829#else
830 call neko_error('No device backend configured')
831#endif
832 end subroutine device_col2
833
835 subroutine device_col3(a_d, b_d, c_d, n, strm)
836 type(c_ptr) :: a_d, b_d, c_d
837 integer :: n
838 type(c_ptr), optional :: strm
839 type(c_ptr) :: strm_
840
841 if (n .lt. 1) return
842
843 if (present(strm)) then
844 strm_ = strm
845 else
846 strm_ = glb_cmd_queue
847 end if
848
849#if HAVE_HIP
850 call hip_col3(a_d, b_d, c_d, n, strm_)
851#elif HAVE_CUDA
852 call cuda_col3(a_d, b_d, c_d, n, strm_)
853#elif HAVE_OPENCL
854 call opencl_col3(a_d, b_d, c_d, n, strm_)
855#else
856 call neko_error('No device backend configured')
857#endif
858 end subroutine device_col3
859
861 subroutine device_subcol3(a_d, b_d, c_d, n, strm)
862 type(c_ptr) :: a_d, b_d, c_d
863 integer :: n
864 type(c_ptr), optional :: strm
865 type(c_ptr) :: strm_
866
867 if (n .lt. 1) return
868
869 if (present(strm)) then
870 strm_ = strm
871 else
872 strm_ = glb_cmd_queue
873 end if
874
875#if HAVE_HIP
876 call hip_subcol3(a_d, b_d, c_d, n, strm_)
877#elif HAVE_CUDA
878 call cuda_subcol3(a_d, b_d, c_d, n, strm_)
879#elif HAVE_OPENCL
880 call opencl_subcol3(a_d, b_d, c_d, n, strm_)
881#else
882 call neko_error('No device backend configured')
883#endif
884 end subroutine device_subcol3
885
887 subroutine device_sub2(a_d, b_d, n, strm)
888 type(c_ptr) :: a_d, b_d
889 integer :: n
890 type(c_ptr), optional :: strm
891 type(c_ptr) :: strm_
892
893 if (n .lt. 1) return
894
895 if (present(strm)) then
896 strm_ = strm
897 else
898 strm_ = glb_cmd_queue
899 end if
900
901#if HAVE_HIP
902 call hip_sub2(a_d, b_d, n, strm_)
903#elif HAVE_CUDA
904 call cuda_sub2(a_d, b_d, n, strm_)
905#elif HAVE_OPENCL
906 call opencl_sub2(a_d, b_d, n, strm_)
907#else
908 call neko_error('No device backend configured')
909#endif
910 end subroutine device_sub2
911
913 subroutine device_sub3(a_d, b_d, c_d, n, strm)
914 type(c_ptr) :: a_d, b_d, c_d
915 integer :: n
916 type(c_ptr), optional :: strm
917 type(c_ptr) :: strm_
918
919 if (n .lt. 1) return
920
921 if (present(strm)) then
922 strm_ = strm
923 else
924 strm_ = glb_cmd_queue
925 end if
926
927#if HAVE_HIP
928 call hip_sub3(a_d, b_d, c_d, n, strm_)
929#elif HAVE_CUDA
930 call cuda_sub3(a_d, b_d, c_d, n, strm_)
931#elif HAVE_OPENCL
932 call opencl_sub3(a_d, b_d, c_d, n, strm_)
933#else
934 call neko_error('No device backend configured')
935#endif
936 end subroutine device_sub3
937
939 subroutine device_addcol3(a_d, b_d, c_d, n, strm)
940 type(c_ptr) :: a_d, b_d, c_d
941 integer :: n
942 type(c_ptr), optional :: strm
943 type(c_ptr) :: strm_
944
945 if (n .lt. 1) return
946
947 if (present(strm)) then
948 strm_ = strm
949 else
950 strm_ = glb_cmd_queue
951 end if
952
953#if HAVE_HIP
954 call hip_addcol3(a_d, b_d, c_d, n, strm_)
955#elif HAVE_CUDA
956 call cuda_addcol3(a_d, b_d, c_d, n, strm_)
957#elif HAVE_OPENCL
958 call opencl_addcol3(a_d, b_d, c_d, n, strm_)
959#else
960 call neko_error('No device backend configured')
961#endif
962 end subroutine device_addcol3
963
965 subroutine device_addcol4(a_d, b_d, c_d, d_d, n, strm)
966 type(c_ptr) :: a_d, b_d, c_d, d_d
967 integer :: n
968 type(c_ptr), optional :: strm
969 type(c_ptr) :: strm_
970
971 if (n .lt. 1) return
972
973 if (present(strm)) then
974 strm_ = strm
975 else
976 strm_ = glb_cmd_queue
977 end if
978
979#if HAVE_HIP
980 call hip_addcol4(a_d, b_d, c_d, d_d, n, strm_)
981#elif HAVE_CUDA
982 call cuda_addcol4(a_d, b_d, c_d, d_d, n, strm_)
983#elif HAVE_OPENCL
984 call opencl_addcol4(a_d, b_d, c_d, d_d, n, strm_)
985#else
986 call neko_error('No device backend configured')
987#endif
988 end subroutine device_addcol4
989
991 subroutine device_addcol3s2(a_d, b_d, c_d, s, n, strm)
992 type(c_ptr) :: a_d, b_d, c_d
993 real(kind=rp) :: s
994 integer :: n
995 type(c_ptr), optional :: strm
996 type(c_ptr) :: strm_
997
998 if (n .lt. 1) return
999
1000 if (present(strm)) then
1001 strm_ = strm
1002 else
1003 strm_ = glb_cmd_queue
1004 end if
1005
1006#if HAVE_HIP
1007 call hip_addcol3s2(a_d, b_d, c_d, s, n, strm_)
1008#elif HAVE_CUDA
1009 call cuda_addcol3s2(a_d, b_d, c_d, s, n, strm_)
1010#elif HAVE_OPENCL
1011 call opencl_addcol3s2(a_d, b_d, c_d, s, n, strm_)
1012#else
1013 call neko_error('No device backend configured')
1014#endif
1015 end subroutine device_addcol3s2
1016
1019 subroutine device_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm)
1020 type(c_ptr) :: dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d
1021 integer :: n
1022 type(c_ptr), optional :: strm
1023 type(c_ptr) :: strm_
1024
1025 if (n .lt. 1) return
1026
1027 if (present(strm)) then
1028 strm_ = strm
1029 else
1030 strm_ = glb_cmd_queue
1031 end if
1032
1033#if HAVE_HIP
1034 call hip_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1035#elif HAVE_CUDA
1036 call cuda_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1037#elif HAVE_OPENCL
1038 call opencl_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1039#else
1040 call neko_error('No device backend configured')
1041#endif
1042 end subroutine device_vdot3
1043
1046 subroutine device_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1047 w1_d, w2_d, w3_d, n, strm)
1048 type(c_ptr) :: u1_d, u2_d, u3_d
1049 type(c_ptr) :: v1_d, v2_d, v3_d
1050 type(c_ptr) :: w1_d, w2_d, w3_d
1051 integer :: n
1052 type(c_ptr), optional :: strm
1053 type(c_ptr) :: strm_
1054
1055 if (n .lt. 1) return
1056
1057 if (present(strm)) then
1058 strm_ = strm
1059 else
1060 strm_ = glb_cmd_queue
1061 end if
1062
1063#if HAVE_HIP
1064 call hip_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1065 w1_d, w2_d, w3_d, n, strm_)
1066#elif HAVE_CUDA
1067 call cuda_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1068 w1_d, w2_d, w3_d, n, strm_)
1069#elif HAVE_OPENCL
1070 call opencl_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1071 w1_d, w2_d, w3_d, n, strm_)
1072#else
1073 call neko_error('No device backend configured')
1074#endif
1075 end subroutine device_vcross
1076
1077
1079 function device_vlsc3(u_d, v_d, w_d, n, strm) result(res)
1080 type(c_ptr) :: u_d, v_d, w_d
1081 integer :: n
1082 type(c_ptr), optional :: strm
1083 type(c_ptr) :: strm_
1084 real(kind=rp) :: res
1085
1086 res = 0.0_rp
1087
1088 if (n .lt. 1) return
1089
1090 if (present(strm)) then
1091 strm_ = strm
1092 else
1093 strm_ = glb_cmd_queue
1094 end if
1095
1096#if HAVE_HIP
1097 res = hip_vlsc3(u_d, v_d, w_d, n, strm_)
1098#elif HAVE_CUDA
1099 res = cuda_vlsc3(u_d, v_d, w_d, n, strm_)
1100#elif HAVE_OPENCL
1101 ! Same kernel as glsc3 (currently no device MPI for OpenCL)
1102 res = opencl_glsc3(u_d, v_d, w_d, n, strm_)
1103#else
1104 call neko_error('No device backend configured')
1105#endif
1106 end function device_vlsc3
1107
1109 function device_glsc3(a_d, b_d, c_d, n, strm) result(res)
1110 type(c_ptr) :: a_d, b_d, c_d
1111 integer :: n, ierr
1112 type(c_ptr), optional :: strm
1113 type(c_ptr) :: strm_
1114 real(kind=rp) :: res
1115
1116 if (present(strm)) then
1117 strm_ = strm
1118 else
1119 strm_ = glb_cmd_queue
1120 end if
1121
1122 res = 0.0_rp
1123#if HAVE_HIP
1124 res = hip_glsc3(a_d, b_d, c_d, n, strm_)
1125#elif HAVE_CUDA
1126 res = cuda_glsc3(a_d, b_d, c_d, n, strm_)
1127#elif HAVE_OPENCL
1128 res = opencl_glsc3(a_d, b_d, c_d, n, strm_)
1129#else
1130 call neko_error('No device backend configured')
1131#endif
1132
1133#ifndef HAVE_DEVICE_MPI
1134 if (pe_size .gt. 1) then
1135 call mpi_allreduce(mpi_in_place, res, 1, &
1136 mpi_real_precision, mpi_sum, neko_comm, ierr)
1137 end if
1138#endif
1139 end function device_glsc3
1140
1141 subroutine device_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm)
1142 type(c_ptr), value :: w_d, v_d_d, mult_d
1143 integer(c_int) :: j, n
1144 real(c_rp) :: h(j)
1145 type(c_ptr), optional :: strm
1146 type(c_ptr) :: strm_
1147 integer :: ierr
1148
1149 if (present(strm)) then
1150 strm_ = strm
1151 else
1152 strm_ = glb_cmd_queue
1153 end if
1154
1155#if HAVE_HIP
1156 call hip_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1157#elif HAVE_CUDA
1158 call cuda_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1159#elif HAVE_OPENCL
1160 call opencl_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1161#else
1162 call neko_error('No device backend configured')
1163#endif
1164
1165#ifndef HAVE_DEVICE_MPI
1166 if (pe_size .gt. 1) then
1167 call mpi_allreduce(mpi_in_place, h, j, &
1168 mpi_real_precision, mpi_sum, neko_comm, ierr)
1169 end if
1170#endif
1171 end subroutine device_glsc3_many
1172
1173 subroutine device_add2s2_many(y_d, x_d_d, a_d, j, n, strm)
1174 type(c_ptr), value :: y_d, x_d_d, a_d
1175 integer(c_int) :: j, n
1176 type(c_ptr), optional :: strm
1177 type(c_ptr) :: strm_
1178
1179 if (n .lt. 1) return
1180
1181 if (present(strm)) then
1182 strm_ = strm
1183 else
1184 strm_ = glb_cmd_queue
1185 end if
1186
1187#if HAVE_HIP
1188 call hip_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1189#elif HAVE_CUDA
1190 call cuda_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1191#elif HAVE_OPENCL
1192 call opencl_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1193#else
1194 call neko_error('No device backend configured')
1195#endif
1196 end subroutine device_add2s2_many
1197
1199 function device_glsc2(a_d, b_d, n, strm) result(res)
1200 type(c_ptr) :: a_d, b_d
1201 integer :: n, ierr
1202 real(kind=rp) :: res
1203 type(c_ptr), optional :: strm
1204 type(c_ptr) :: strm_
1205
1206 if (present(strm)) then
1207 strm_ = strm
1208 else
1209 strm_ = glb_cmd_queue
1210 end if
1211
1212 res = 0.0_rp
1213#if HAVE_HIP
1214 res = hip_glsc2(a_d, b_d, n, strm_)
1215#elif HAVE_CUDA
1216 res = cuda_glsc2(a_d, b_d, n, strm_)
1217#elif HAVE_OPENCL
1218 res = opencl_glsc2(a_d, b_d, n, strm_)
1219#else
1220 call neko_error('No device backend configured')
1221#endif
1222
1223#ifndef HAVE_DEVICE_MPI
1224 if (pe_size .gt. 1) then
1225 call mpi_allreduce(mpi_in_place, res, 1, &
1226 mpi_real_precision, mpi_sum, neko_comm, ierr)
1227 end if
1228#endif
1229 end function device_glsc2
1230
1233 function device_glsubnorm(a_d, b_d, n, strm) result(res)
1234 type(c_ptr), intent(in) :: a_d, b_d
1235 integer, intent(in) :: n
1236 integer :: ierr
1237 real(kind=rp) :: res
1238 type(c_ptr), optional :: strm
1239 type(c_ptr) :: strm_
1240
1241 if (present(strm)) then
1242 strm_ = strm
1243 else
1244 strm_ = glb_cmd_queue
1245 end if
1246
1247 res = 0.0_rp
1248#if HAVE_HIP
1249 res = hip_glsubnorm2(a_d, b_d, n, strm_)
1250#elif HAVE_CUDA
1251 res = cuda_glsubnorm2(a_d, b_d, n, strm_)
1252#elif HAVE_OPENCL
1253 res = opencl_glsubnorm2(a_d, b_d, n, strm_)
1254#else
1255 call neko_error('No device backend configured')
1256#endif
1257
1258#ifndef HAVE_DEVICE_MPI
1259 if (pe_size .gt. 1) then
1260 call mpi_allreduce(mpi_in_place, res, 1, &
1261 mpi_real_precision, mpi_sum, neko_comm, ierr)
1262 end if
1263#endif
1264
1265 res = sqrt(res)
1266 end function device_glsubnorm
1267
1269 function device_glsum(a_d, n, strm) result(res)
1270 type(c_ptr) :: a_d
1271 integer :: n, ierr
1272 real(kind=rp) :: res
1273 type(c_ptr), optional :: strm
1274 type(c_ptr) :: strm_
1275
1276 if (present(strm)) then
1277 strm_ = strm
1278 else
1279 strm_ = glb_cmd_queue
1280 end if
1281
1282 res = 0.0_rp
1283#if HAVE_HIP
1284 res = hip_glsum(a_d, n, strm_)
1285#elif HAVE_CUDA
1286 res = cuda_glsum(a_d, n, strm_)
1287#elif HAVE_OPENCL
1288 res = opencl_glsum(a_d, n, strm_)
1289#else
1290 call neko_error('No device backend configured')
1291#endif
1292
1293#ifndef HAVE_DEVICE_MPI
1294 if (pe_size .gt. 1) then
1295 call mpi_allreduce(mpi_in_place, res, 1, &
1296 mpi_real_precision, mpi_sum, neko_comm, ierr)
1297 end if
1298#endif
1299 end function device_glsum
1300
1301 subroutine device_absval(a_d, n, strm)
1302 integer, intent(in) :: n
1303 type(c_ptr) :: a_d
1304 type(c_ptr), optional :: strm
1305 type(c_ptr) :: strm_
1306
1307 if (n .lt. 1) return
1308
1309 if (present(strm)) then
1310 strm_ = strm
1311 else
1312 strm_ = glb_cmd_queue
1313 end if
1314
1315#ifdef HAVE_HIP
1316 call hip_absval(a_d, n, strm_)
1317#elif HAVE_CUDA
1318 call cuda_absval(a_d, n, strm_)
1319#elif HAVE_OPENCL
1320 call opencl_absval(a_d, n, strm_)
1321#else
1322 call neko_error('No device backend configured')
1323#endif
1324
1325 end subroutine device_absval
1326
1327 ! ========================================================================== !
1328 ! Device point-wise max
1329
1332 subroutine device_pwmax2(a_d, b_d, n, strm)
1333 type(c_ptr) :: a_d, b_d
1334 integer :: n
1335 type(c_ptr), optional :: strm
1336 type(c_ptr) :: strm_
1337
1338 if (n .lt. 1) return
1339
1340 if (present(strm)) then
1341 strm_ = strm
1342 else
1343 strm_ = glb_cmd_queue
1344 end if
1345
1346#if HAVE_HIP
1347 call hip_pwmax_vec2(a_d, b_d, n, strm_)
1348#elif HAVE_CUDA
1349 call cuda_pwmax_vec2(a_d, b_d, n, strm_)
1350#elif HAVE_OPENCL
1351 call opencl_pwmax_vec2(a_d, b_d, n, strm_)
1352#else
1353 call neko_error('No device backend configured')
1354#endif
1355 end subroutine device_pwmax2
1356
1359 subroutine device_pwmax3(a_d, b_d, c_d, n, strm)
1360 type(c_ptr) :: a_d, b_d, c_d
1361 integer :: n
1362 type(c_ptr), optional :: strm
1363 type(c_ptr) :: strm_
1364
1365 if (n .lt. 1) return
1366
1367 if (present(strm)) then
1368 strm_ = strm
1369 else
1370 strm_ = glb_cmd_queue
1371 end if
1372
1373#if HAVE_HIP
1374 call hip_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1375#elif HAVE_CUDA
1376 call cuda_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1377#elif HAVE_OPENCL
1378 call opencl_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1379#else
1380 call neko_error('No device backend configured')
1381#endif
1382
1383 end subroutine device_pwmax3
1384
1387 subroutine device_cpwmax2(a_d, c, n, strm)
1388 type(c_ptr) :: a_d
1389 real(kind=rp), intent(in) :: c
1390 integer :: n
1391 type(c_ptr), optional :: strm
1392 type(c_ptr) :: strm_
1393
1394 if (n .lt. 1) return
1395
1396 if (present(strm)) then
1397 strm_ = strm
1398 else
1399 strm_ = glb_cmd_queue
1400 end if
1401
1402#if HAVE_HIP
1403 call hip_pwmax_sca2(a_d, c, n, strm_)
1404#elif HAVE_CUDA
1405 call cuda_pwmax_sca2(a_d, c, n, strm_)
1406#elif HAVE_OPENCL
1407 call opencl_pwmax_sca2(a_d, c, n, strm_)
1408#else
1409 call neko_error('No device backend configured')
1410#endif
1411
1412 end subroutine device_cpwmax2
1413
1416 subroutine device_cpwmax3(a_d, b_d, c, n, strm)
1417 type(c_ptr) :: a_d, b_d
1418 real(kind=rp), intent(in) :: c
1419 integer :: n
1420 type(c_ptr), optional :: strm
1421 type(c_ptr) :: strm_
1422
1423 if (n .lt. 1) return
1424
1425 if (present(strm)) then
1426 strm_ = strm
1427 else
1428 strm_ = glb_cmd_queue
1429 end if
1430
1431#if HAVE_HIP
1432 call hip_pwmax_sca3(a_d, b_d, c, n, strm_)
1433#elif HAVE_CUDA
1434 call cuda_pwmax_sca3(a_d, b_d, c, n, strm_)
1435#elif HAVE_OPENCL
1436 call opencl_pwmax_sca3(a_d, b_d, c, n, strm_)
1437#else
1438 call neko_error('No device backend configured')
1439#endif
1440
1441 end subroutine device_cpwmax3
1442
1443 ! ========================================================================== !
1444 ! Device point-wise min
1445
1448 subroutine device_pwmin2(a_d, b_d, n, strm)
1449 type(c_ptr) :: a_d, b_d
1450 integer :: n
1451 type(c_ptr), optional :: strm
1452 type(c_ptr) :: strm_
1453
1454 if (n .lt. 1) return
1455
1456 if (present(strm)) then
1457 strm_ = strm
1458 else
1459 strm_ = glb_cmd_queue
1460 end if
1461
1462#if HAVE_HIP
1463 call hip_pwmin_vec2(a_d, b_d, n, strm_)
1464#elif HAVE_CUDA
1465 call cuda_pwmin_vec2(a_d, b_d, n, strm_)
1466#elif HAVE_OPENCL
1467 call opencl_pwmin_vec2(a_d, b_d, n, strm_)
1468#else
1469 call neko_error('No device backend configured')
1470#endif
1471 end subroutine device_pwmin2
1472
1475 subroutine device_pwmin3(a_d, b_d, c_d, n, strm)
1476 type(c_ptr) :: a_d, b_d, c_d
1477 integer :: n
1478 type(c_ptr), optional :: strm
1479 type(c_ptr) :: strm_
1480
1481 if (n .lt. 1) return
1482
1483 if (present(strm)) then
1484 strm_ = strm
1485 else
1486 strm_ = glb_cmd_queue
1487 end if
1488
1489#if HAVE_HIP
1490 call hip_pwmin_vec3(a_d, b_d, c_d, n, strm_)
1491#elif HAVE_CUDA
1492 call cuda_pwmin_vec3(a_d, b_d, c_d, n, strm_)
1493#elif HAVE_OPENCL
1494 call opencl_pwmin_vec3(a_d, b_d, c_d, n, strm_)
1495#else
1496 call neko_error('No device backend configured')
1497#endif
1498
1499 end subroutine device_pwmin3
1500
1503 subroutine device_cpwmin2(a_d, c, n, strm)
1504 type(c_ptr) :: a_d
1505 real(kind=rp), intent(in) :: c
1506 integer :: n
1507 type(c_ptr), optional :: strm
1508 type(c_ptr) :: strm_
1509
1510 if (n .lt. 1) return
1511
1512 if (present(strm)) then
1513 strm_ = strm
1514 else
1515 strm_ = glb_cmd_queue
1516 end if
1517
1518#if HAVE_HIP
1519 call hip_pwmin_sca2(a_d, c, n, strm_)
1520#elif HAVE_CUDA
1521 call cuda_pwmin_sca2(a_d, c, n, strm_)
1522#elif HAVE_OPENCL
1523 call opencl_pwmin_sca2(a_d, c, n, strm_)
1524#else
1525 call neko_error('No device backend configured')
1526#endif
1527
1528 end subroutine device_cpwmin2
1529
1532 subroutine device_cpwmin3(a_d, b_d, c, n, strm)
1533 type(c_ptr) :: a_d, b_d
1534 real(kind=rp), intent(in) :: c
1535 integer :: n
1536 type(c_ptr), optional :: strm
1537 type(c_ptr) :: strm_
1538
1539 if (n .lt. 1) return
1540
1541 if (present(strm)) then
1542 strm_ = strm
1543 else
1544 strm_ = glb_cmd_queue
1545 end if
1546
1547#if HAVE_HIP
1548 call hip_pwmin_sca3(a_d, b_d, c, n, strm_)
1549#elif HAVE_CUDA
1550 call cuda_pwmin_sca3(a_d, b_d, c, n, strm_)
1551#elif HAVE_OPENCL
1552 call opencl_pwmin_sca3(a_d, b_d, c, n, strm_)
1553#else
1554 call neko_error('No device backend configured')
1555#endif
1556
1557 end subroutine device_cpwmin3
1558
1559 ! ========================================================================== !
1560 ! Integer operations
1561
1563 subroutine device_iadd(a_d, c, n, strm)
1564 type(c_ptr), intent(inout) :: a_d
1565 integer, intent(in) :: c
1566 integer, intent(in) :: n
1567 type(c_ptr), optional :: strm
1568 type(c_ptr) :: strm_
1569 if (n .lt. 1) return
1570
1571 if (present(strm)) then
1572 strm_ = strm
1573 else
1574 strm_ = glb_cmd_queue
1575 end if
1576
1577#if HAVE_HIP
1578 call hip_iadd(a_d, c, n, strm_)
1579#elif HAVE_CUDA
1580 call cuda_iadd(a_d, c, n, strm_)
1581#elif HAVE_OPENCL
1582 call opencl_iadd(a_d, c, n, strm_)
1583#else
1584 call neko_error('No device backend configured')
1585#endif
1586 end subroutine device_iadd
1587
1588
1589end module device_math
Definition comm.F90:1
type(mpi_datatype), public mpi_real_precision
MPI type for working precision of REAL types.
Definition comm.F90:51
integer, public pe_size
MPI size of communicator.
Definition comm.F90:59
type(mpi_comm), public neko_comm
MPI communicator.
Definition comm.F90:43
subroutine, public device_pwmin2(a_d, b_d, n, strm)
Compute the point-wise minimum of two vectors .
subroutine, public device_add2s1(a_d, b_d, c1, n, strm)
subroutine, public device_add2s2_many(y_d, x_d_d, a_d, j, n, strm)
subroutine, public device_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm)
Returns .
subroutine, public device_sub3(a_d, b_d, c_d, n, strm)
Vector subtraction .
subroutine, public device_masked_scatter_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Scatter a masked vector .
subroutine, public device_add2s2(a_d, b_d, c1, n, strm)
Vector addition with scalar multiplication (multiplication on first argument)
subroutine, public device_add2(a_d, b_d, n, strm)
Vector addition .
subroutine, public device_addcol3(a_d, b_d, c_d, n, strm)
Returns .
real(kind=rp) function, public device_glsum(a_d, n, strm)
Sum a vector of length n.
subroutine, public device_pwmax3(a_d, b_d, c_d, n, strm)
Compute the point-wise maximum of two vectors .
subroutine, public device_invcol1(a_d, n, strm)
Invert a vector .
subroutine, public device_add3s2(a_d, b_d, c_d, c1, c2, n, strm)
Returns .
subroutine, public device_masked_atomic_reduction_0(a_d, b_d, mask_d, n, n_mask, strm)
subroutine, public device_cpwmax3(a_d, b_d, c, n, strm)
Compute the point-wise maximum of a vector and a scalar .
subroutine, public device_rzero(a_d, n, strm)
Zero a real vector.
subroutine, public device_rone(a_d, n, strm)
Set all elements to one.
subroutine, public device_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, w1_d, w2_d, w3_d, n, strm)
Compute a cross product (3-d version) assuming vector components etc.
subroutine, public device_cmult(a_d, c, n, strm)
Multiplication by constant c .
subroutine, public device_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm)
Compute a dot product (3-d version) assuming vector components etc.
real(kind=rp) function, public device_glsubnorm(a_d, b_d, n, strm)
Returns the norm of the difference of two vectors .
subroutine, public device_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm)
subroutine, public device_cfill_mask(a_d, c, n, mask_d, n_mask, strm)
Fill a constant to a masked vector. .
subroutine, public device_cadd2(a_d, b_d, c, n, strm)
Add a scalar to vector .
subroutine, public device_sub2(a_d, b_d, n, strm)
Vector substraction .
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
subroutine, public device_invcol3(a_d, b_d, c_d, n, strm)
Vector division .
subroutine, public device_pwmin3(a_d, b_d, c_d, n, strm)
Compute the point-wise minimum of two vectors .
subroutine, public device_col2(a_d, b_d, n, strm)
Vector multiplication .
subroutine, public device_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
Gather a masked vector .
real(kind=rp) function, public device_vlsc3(u_d, v_d, w_d, n, strm)
Compute multiplication sum .
subroutine, public device_cdiv2(a_d, b_d, c, n, strm)
Division of constant c by array .
subroutine, public device_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm)
Returns .
subroutine, public device_add4(a_d, b_d, c_d, d_d, n, strm)
subroutine, public device_cpwmax2(a_d, c, n, strm)
Compute the point-wise maximum of a vector and a scalar .
subroutine, public device_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Copy a masked vector .
subroutine, public device_subcol3(a_d, b_d, c_d, n, strm)
Returns .
subroutine, public device_cdiv(a_d, c, n, strm)
Division of constant c by array .
subroutine, public device_absval(a_d, n, strm)
subroutine device_iadd(a_d, c, n, strm)
Add an integer scalar to vector .
subroutine, public device_masked_gather_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Gather a masked vector .
subroutine, public device_invcol2(a_d, b_d, n, strm)
Vector division .
subroutine, public device_addsqr2s2(a_d, b_d, c1, n, strm)
Returns .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
subroutine, public device_cpwmin3(a_d, b_d, c, n, strm)
Compute the point-wise minimum of a vector and a scalar .
real(kind=rp) function, public device_glsc2(a_d, b_d, n, strm)
Weighted inner product .
subroutine, public device_cmult2(a_d, b_d, c, n, strm)
Multiplication by constant c .
subroutine, public device_col3(a_d, b_d, c_d, n, strm)
Vector multiplication with 3 vectors .
subroutine, public device_addcol4(a_d, b_d, c_d, d_d, n, strm)
Returns .
subroutine, public device_cfill(a_d, c, n, strm)
Set all elements to a constant c .
subroutine, public device_cpwmin2(a_d, c, n, strm)
Compute the point-wise minimum of a vector and a scalar .
subroutine, public device_add3(a_d, b_d, c_d, n, strm)
Vector addition .
subroutine, public device_addcol3s2(a_d, b_d, c_d, s, n, strm)
Returns .
subroutine, public device_pwmax2(a_d, b_d, n, strm)
Compute the point-wise maximum of two vectors .
subroutine device_radd(a_d, c, n, strm)
Add a scalar to vector .
Device abstraction, common interface for various accelerators.
Definition device.F90:34
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
Definition device.F90:51
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Utilities.
Definition utils.f90:35