Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
device_math.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2025, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
34 use, intrinsic :: iso_c_binding, only: c_ptr, c_int
35 use num_types, only : rp, c_rp
36 use utils, only : neko_error
38 use mpi_f08, only : mpi_sum, mpi_min, mpi_max, mpi_in_place, mpi_allreduce
39 use device, only : glb_cmd_queue
40 ! ========================================================================== !
41 ! Device math interfaces
42
43 use hip_math
44 use cuda_math
45 use opencl_math
46
47 implicit none
48 private
49
50 interface device_cadd
51 module procedure device_radd, device_iadd
52 end interface device_cadd
53
70
71contains
72
74 subroutine device_copy(a_d, b_d, n, strm)
75 type(c_ptr) :: a_d, b_d
76 integer :: n
77 type(c_ptr), optional :: strm
78 type(c_ptr) :: strm_
79
80 if (n .lt. 1) return
81
82 if (present(strm)) then
83 strm_ = strm
84 else
85 strm_ = glb_cmd_queue
86 end if
87
88#if HAVE_HIP
89 call hip_copy(a_d, b_d, n, strm_)
90#elif HAVE_CUDA
91 call cuda_copy(a_d, b_d, n, strm_)
92#elif HAVE_OPENCL
93 call opencl_copy(a_d, b_d, n, strm_)
94#else
95 call neko_error('no device backend configured')
96#endif
97 end subroutine device_copy
98
100 subroutine device_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
101 type(c_ptr) :: a_d, b_d, mask_d
102 integer :: n, n_mask
103 type(c_ptr), optional :: strm
104 type(c_ptr) :: strm_
105
106 if (n .lt. 1 .or. n_mask .lt. 1) return
107
108 if (present(strm)) then
109 strm_ = strm
110 else
111 strm_ = glb_cmd_queue
112 end if
113
114#if HAVE_HIP
115 call hip_masked_copy(a_d, b_d, mask_d, n, n_mask, strm_)
116#elif HAVE_CUDA
117 call cuda_masked_copy(a_d, b_d, mask_d, n, n_mask, strm_)
118#elif HAVE_OPENCL
119 call opencl_masked_copy(a_d, b_d, mask_d, n, n_mask, strm_)
120#else
121 call neko_error('no device backend configured')
122#endif
123 end subroutine device_masked_copy_0
124
126 subroutine device_masked_gather_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
127 type(c_ptr) :: a_d, b_d, mask_d
128 integer :: n, n_mask
129 type(c_ptr), optional :: strm
130 type(c_ptr) :: strm_
131
132 if (n .lt. 1 .or. n_mask .lt. 1) return
133
134 if (present(strm)) then
135 strm_ = strm
136 else
137 strm_ = glb_cmd_queue
138 end if
139
140#if HAVE_HIP
141 call hip_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
142#elif HAVE_CUDA
143 call cuda_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
144#elif HAVE_OPENCL
145 call opencl_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
146#else
147 call neko_error('no device backend configured')
148#endif
149 end subroutine device_masked_gather_copy_0
150
152 ! In this case, the mask comes from a mask_t type
153 subroutine device_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
154 type(c_ptr) :: a_d, b_d, mask_d
155 integer :: n, n_mask
156 type(c_ptr), optional :: strm
157 type(c_ptr) :: strm_
158
159 if (n .lt. 1 .or. n_mask .lt. 1) return
160
161 if (present(strm)) then
162 strm_ = strm
163 else
164 strm_ = glb_cmd_queue
165 end if
166
167#if HAVE_HIP
168 call hip_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
169#elif HAVE_CUDA
170 call cuda_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
171#elif HAVE_OPENCL
172 call opencl_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
173#else
174 call neko_error('no device backend configured')
175#endif
177
179 subroutine device_masked_scatter_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
180 type(c_ptr) :: a_d, b_d, mask_d
181 integer :: n, n_mask
182 type(c_ptr), optional :: strm
183 type(c_ptr) :: strm_
184
185 if (n .lt. 1 .or. n_mask .lt. 1) return
186
187 if (present(strm)) then
188 strm_ = strm
189 else
190 strm_ = glb_cmd_queue
191 end if
192
193#if HAVE_HIP
194 call hip_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
195#elif HAVE_CUDA
196 call cuda_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
197#elif HAVE_OPENCL
198 call opencl_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
199#else
200 call neko_error('no device backend configured')
201#endif
202 end subroutine device_masked_scatter_copy_0
203
204 subroutine device_masked_atomic_reduction_0(a_d, b_d, mask_d, n, n_mask, strm)
205 type(c_ptr) :: a_d, b_d, mask_d
206 integer :: n, n_mask
207 type(c_ptr), optional :: strm
208 type(c_ptr) :: strm_
209
210 if (n .lt. 1 .or. n_mask .lt. 1) return
211
212 if (present(strm)) then
213 strm_ = strm
214 else
215 strm_ = glb_cmd_queue
216 end if
217
218#if HAVE_HIP
219 call hip_masked_atomic_reduction(a_d, b_d, mask_d, n, n_mask, strm_)
220#elif HAVE_CUDA
221 call cuda_masked_atomic_reduction(a_d, b_d, mask_d, n, n_mask, strm_)
222#elif HAVE_OPENCL
223 call neko_error('No OpenCL bcknd, masked atomic reduction')
224#else
225 call neko_error('no device backend configured')
226#endif
228
231 subroutine device_cfill_mask(a_d, c, n, mask_d, n_mask, strm)
232 type(c_ptr) :: a_d
233 real(kind=rp), intent(in) :: c
234 integer :: n
235 type(c_ptr) :: mask_d
236 integer :: n_mask
237 type(c_ptr), optional :: strm
238 type(c_ptr) :: strm_
239
240 if (n .lt. 1 .or. n_mask .lt. 1) return
241
242 if (present(strm)) then
243 strm_ = strm
244 else
245 strm_ = glb_cmd_queue
246 end if
247
248#if HAVE_HIP
249 call hip_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
250#elif HAVE_CUDA
251 call cuda_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
252#elif HAVE_OPENCL
253 call opencl_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
254#else
255 call neko_error('No device backend configured')
256#endif
257 end subroutine device_cfill_mask
258
260 subroutine device_rzero(a_d, n, strm)
261 type(c_ptr) :: a_d
262 integer :: n
263 type(c_ptr), optional :: strm
264 type(c_ptr) :: strm_
265
266 if (n .lt. 1) return
267
268 if (present(strm)) then
269 strm_ = strm
270 else
271 strm_ = glb_cmd_queue
272 end if
273
274#if HAVE_HIP
275 call hip_rzero(a_d, n, strm_)
276#elif HAVE_CUDA
277 call cuda_rzero(a_d, n, strm_)
278#elif HAVE_OPENCL
279 call opencl_rzero(a_d, n, strm_)
280#else
281 call neko_error('No device backend configured')
282#endif
283 end subroutine device_rzero
284
286 subroutine device_rone(a_d, n, strm)
287 type(c_ptr) :: a_d
288 integer :: n
289 type(c_ptr), optional :: strm
290 type(c_ptr) :: strm_
291 real(kind=rp), parameter :: one = 1.0_rp
292
293 if (n .lt. 1) return
294
295 if (present(strm)) then
296 strm_ = strm
297 else
298 strm_ = glb_cmd_queue
299 end if
300
301#if HAVE_HIP || HAVE_CUDA || HAVE_OPENCL
302 call device_cfill(a_d, one, n, strm_)
303#else
304 call neko_error('No device backend configured')
305#endif
306 end subroutine device_rone
307
309 subroutine device_cmult(a_d, c, n, strm)
310 type(c_ptr) :: a_d
311 real(kind=rp), intent(in) :: c
312 integer :: n
313 type(c_ptr), optional :: strm
314 type(c_ptr) :: strm_
315
316 if (n .lt. 1) return
317
318 if (present(strm)) then
319 strm_ = strm
320 else
321 strm_ = glb_cmd_queue
322 end if
323
324#if HAVE_HIP
325 call hip_cmult(a_d, c, n, strm_)
326#elif HAVE_CUDA
327 call cuda_cmult(a_d, c, n, strm_)
328#elif HAVE_OPENCL
329 call opencl_cmult(a_d, c, n, strm_)
330#else
331 call neko_error('No device backend configured')
332#endif
333 end subroutine device_cmult
334
336 subroutine device_cmult2(a_d, b_d, c, n, strm)
337 type(c_ptr) :: a_d, b_d
338 real(kind=rp), intent(in) :: c
339 integer :: n
340 type(c_ptr), optional :: strm
341 type(c_ptr) :: strm_
342
343 if (n .lt. 1) return
344
345 if (present(strm)) then
346 strm_ = strm
347 else
348 strm_ = glb_cmd_queue
349 end if
350
351#if HAVE_HIP
352 call hip_cmult2(a_d, b_d, c, n, strm_)
353#elif HAVE_CUDA
354 call cuda_cmult2(a_d, b_d, c, n, strm_)
355#elif HAVE_OPENCL
356 call opencl_cmult2(a_d, b_d, c, n, strm_)
357#else
358 call neko_error('No device backend configured')
359#endif
360 end subroutine device_cmult2
361
363 subroutine device_cdiv(a_d, c, n, strm)
364 type(c_ptr) :: a_d
365 real(kind=rp), intent(in) :: c
366 integer :: n
367 type(c_ptr), optional :: strm
368 type(c_ptr) :: strm_
369
370 if (present(strm)) then
371 strm_ = strm
372 else
373 strm_ = glb_cmd_queue
374 end if
375
376#if HAVE_HIP
377 call hip_cdiv(a_d, c, n, strm_)
378#elif HAVE_CUDA
379 call cuda_cdiv(a_d, c, n, strm_)
380#elif HAVE_OPENCL
381 call opencl_cdiv(a_d, c, n, strm_)
382#else
383 call neko_error('No device backend configured')
384#endif
385 end subroutine device_cdiv
386
388 subroutine device_cdiv2(a_d, b_d, c, n, strm)
389 type(c_ptr) :: a_d, b_d
390 real(kind=rp), intent(in) :: c
391 integer :: n
392 type(c_ptr), optional :: strm
393 type(c_ptr) :: strm_
394
395 if (present(strm)) then
396 strm_ = strm
397 else
398 strm_ = glb_cmd_queue
399 end if
400
401#if HAVE_HIP
402 call hip_cdiv2(a_d, b_d, c, n, strm_)
403#elif HAVE_CUDA
404 call cuda_cdiv2(a_d, b_d, c, n, strm_)
405#elif HAVE_OPENCL
406 call opencl_cdiv2(a_d, b_d, c, n, strm_)
407#else
408 call neko_error('No device backend configured')
409#endif
410 end subroutine device_cdiv2
411
413 subroutine device_radd(a_d, c, n, strm)
414 type(c_ptr) :: a_d
415 real(kind=rp), intent(in) :: c
416 integer :: n
417 type(c_ptr), optional :: strm
418 type(c_ptr) :: strm_
419
420 if (n .lt. 1) return
421
422 if (present(strm)) then
423 strm_ = strm
424 else
425 strm_ = glb_cmd_queue
426 end if
427
428#if HAVE_HIP
429 call hip_radd(a_d, c, n, strm_)
430#elif HAVE_CUDA
431 call cuda_radd(a_d, c, n, strm_)
432#elif HAVE_OPENCL
433 call opencl_radd(a_d, c, n, strm_)
434#else
435 call neko_error('No device backend configured')
436#endif
437 end subroutine device_radd
438
440 subroutine device_cadd2(a_d, b_d, c, n, strm)
441 type(c_ptr) :: a_d
442 type(c_ptr) :: b_d
443 real(kind=rp), intent(in) :: c
444 integer :: n
445 type(c_ptr), optional :: strm
446 type(c_ptr) :: strm_
447
448 if (n .lt. 1) return
449
450 if (present(strm)) then
451 strm_ = strm
452 else
453 strm_ = glb_cmd_queue
454 end if
455
456#if HAVE_HIP
457 call hip_cadd2(a_d, b_d, c, n, strm_)
458#elif HAVE_CUDA
459 call cuda_cadd2(a_d, b_d, c, n, strm_)
460#elif HAVE_OPENCL
461 call opencl_cadd2(a_d, b_d, c, n, strm_)
462#else
463 call neko_error('No device backend configured')
464#endif
465 end subroutine device_cadd2
466
468 subroutine device_cfill(a_d, c, n, strm)
469 type(c_ptr) :: a_d
470 real(kind=rp), intent(in) :: c
471 integer :: n
472 type(c_ptr), optional ::strm
473 type(c_ptr) :: strm_
474
475 if (n .lt. 1) return
476
477 if (present(strm)) then
478 strm_ = strm
479 else
480 strm_ = glb_cmd_queue
481 end if
482
483#if HAVE_HIP
484 call hip_cfill(a_d, c, n, strm_)
485#elif HAVE_CUDA
486 call cuda_cfill(a_d, c, n, strm_)
487#elif HAVE_OPENCL
488 call opencl_cfill(a_d, c, n, strm_)
489#else
490 call neko_error('No device backend configured')
491#endif
492 end subroutine device_cfill
493
495 subroutine device_add2(a_d, b_d, n, strm)
496 type(c_ptr) :: a_d, b_d
497 integer :: n
498 type(c_ptr), optional :: strm
499 type(c_ptr) :: strm_
500
501 if (n .lt. 1) return
502
503 if (present(strm)) then
504 strm_ = strm
505 else
506 strm_ = glb_cmd_queue
507 end if
508
509#if HAVE_HIP
510 call hip_add2(a_d, b_d, n, strm_)
511#elif HAVE_CUDA
512 call cuda_add2(a_d, b_d, n, strm_)
513#elif HAVE_OPENCL
514 call opencl_add2(a_d, b_d, n, strm_)
515#else
516 call neko_error('No device backend configured')
517#endif
518 end subroutine device_add2
519
520 subroutine device_add4(a_d, b_d, c_d, d_d, n, strm)
521 type(c_ptr) :: a_d, b_d, c_d, d_d
522 integer :: n
523 type(c_ptr), optional :: strm
524 type(c_ptr) :: strm_
525
526 if (n .lt. 1) return
527
528 if (present(strm)) then
529 strm_ = strm
530 else
531 strm_ = glb_cmd_queue
532 end if
533
534#if HAVE_HIP
535 call hip_add4(a_d, b_d, c_d, d_d, n, strm_)
536#elif HAVE_CUDA
537 call cuda_add4(a_d, b_d, c_d, d_d, n, strm_)
538#elif HAVE_OPENCL
539 call opencl_add4(a_d, b_d, c_d, d_d, n, strm_)
540#else
541 call neko_error('No device backend configured')
542#endif
543 end subroutine device_add4
544
545 subroutine device_add2s1(a_d, b_d, c1, n, strm)
546 type(c_ptr) :: a_d, b_d
547 real(kind=rp) :: c1
548 integer :: n
549 type(c_ptr), optional :: strm
550 type(c_ptr) :: strm_
551
552 if (n .lt. 1) return
553
554 if (present(strm)) then
555 strm_ = strm
556 else
557 strm_ = glb_cmd_queue
558 end if
559
560#if HAVE_HIP
561 call hip_add2s1(a_d, b_d, c1, n, strm_)
562#elif HAVE_CUDA
563 call cuda_add2s1(a_d, b_d, c1, n, strm_)
564#elif HAVE_OPENCL
565 call opencl_add2s1(a_d, b_d, c1, n, strm_)
566#else
567 call neko_error('No device backend configured')
568#endif
569 end subroutine device_add2s1
570
573 subroutine device_add2s2(a_d, b_d, c1, n, strm)
574 type(c_ptr) :: a_d, b_d
575 real(kind=rp) :: c1
576 integer :: n
577 type(c_ptr), optional :: strm
578 type(c_ptr) :: strm_
579
580 if (n .lt. 1) return
581
582 if (present(strm)) then
583 strm_ = strm
584 else
585 strm_ = glb_cmd_queue
586 end if
587
588#if HAVE_HIP
589 call hip_add2s2(a_d, b_d, c1, n, strm_)
590#elif HAVE_CUDA
591 call cuda_add2s2(a_d, b_d, c1, n, strm_)
592#elif HAVE_OPENCL
593 call opencl_add2s2(a_d, b_d, c1, n, strm_)
594#else
595 call neko_error('No device backend configured')
596#endif
597 end subroutine device_add2s2
598
600 subroutine device_addsqr2s2(a_d, b_d, c1, n, strm)
601 type(c_ptr) :: a_d, b_d
602 real(kind=rp) :: c1
603 integer :: n
604 type(c_ptr), optional :: strm
605 type(c_ptr) :: strm_
606
607 if (n .lt. 1) return
608
609 if (present(strm)) then
610 strm_ = strm
611 else
612 strm_ = glb_cmd_queue
613 end if
614
615#if HAVE_HIP
616 call hip_addsqr2s2(a_d, b_d, c1, n, strm_)
617#elif HAVE_CUDA
618 call cuda_addsqr2s2(a_d, b_d, c1, n, strm_)
619#elif HAVE_OPENCL
620 call opencl_addsqr2s2(a_d, b_d, c1, n, strm_)
621#else
622 call neko_error('No device backend configured')
623#endif
624 end subroutine device_addsqr2s2
625
627 subroutine device_add3(a_d, b_d, c_d, n, strm)
628 type(c_ptr) :: a_d, b_d, c_d
629 integer :: n
630 type(c_ptr), optional :: strm
631 type(c_ptr) :: strm_
632
633 if (n .lt. 1) return
634
635 if (present(strm)) then
636 strm_ = strm
637 else
638 strm_ = glb_cmd_queue
639 end if
640
641#if HAVE_HIP
642 call hip_add3(a_d, b_d, c_d, n, strm_)
643#elif HAVE_CUDA
644 call cuda_add3(a_d, b_d, c_d, n, strm_)
645#elif HAVE_OPENCL
646 call opencl_add3(a_d, b_d, c_d, n, strm_)
647#else
648 call neko_error('No device backend configured')
649#endif
650 end subroutine device_add3
651
653 subroutine device_add3s2(a_d, b_d, c_d, c1, c2 , n, strm)
654 type(c_ptr) :: a_d, b_d, c_d
655 real(kind=rp) :: c1, c2
656 integer :: n
657 type(c_ptr), optional :: strm
658 type(c_ptr) :: strm_
659
660 if (n .lt. 1) return
661
662 if (present(strm)) then
663 strm_ = strm
664 else
665 strm_ = glb_cmd_queue
666 end if
667
668#if HAVE_HIP
669 call hip_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
670#elif HAVE_CUDA
671 call cuda_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
672#elif HAVE_OPENCL
673 call opencl_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
674#else
675 call neko_error('No device backend configured')
676#endif
677 end subroutine device_add3s2
678
680 subroutine device_add4s3(a_d, b_d, c_d, d_d, c1, c2 , c3, n, strm)
681 type(c_ptr) :: a_d, b_d, c_d, d_d
682 real(kind=rp) :: c1, c2, c3
683 integer :: n
684 type(c_ptr), optional :: strm
685 type(c_ptr) :: strm_
686
687 if (n .lt. 1) return
688
689 if (present(strm)) then
690 strm_ = strm
691 else
692 strm_ = glb_cmd_queue
693 end if
694
695#if HAVE_HIP
696 call hip_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
697#elif HAVE_CUDA
698 call cuda_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
699#elif HAVE_OPENCL
700 call opencl_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
701#else
702 call neko_error('No device backend configured')
703#endif
704 end subroutine device_add4s3
705
707 subroutine device_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2 , c3, c4, n, strm)
708 type(c_ptr) :: a_d, b_d, c_d, d_d, e_d
709 real(kind=rp) :: c1, c2, c3, c4
710 integer :: n
711 type(c_ptr), optional :: strm
712 type(c_ptr) :: strm_
713
714 if (n .lt. 1) return
715
716 if (present(strm)) then
717 strm_ = strm
718 else
719 strm_ = glb_cmd_queue
720 end if
721
722#if HAVE_HIP
723 call hip_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
724#elif HAVE_CUDA
725 call cuda_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
726#elif HAVE_OPENCL
727 call opencl_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
728#else
729 call neko_error('No device backend configured')
730#endif
731 end subroutine device_add5s4
732
734 subroutine device_invcol1(a_d, n, strm)
735 type(c_ptr) :: a_d
736 integer :: n
737 type(c_ptr), optional :: strm
738 type(c_ptr) :: strm_
739
740 if (n .lt. 1) return
741
742 if (present(strm)) then
743 strm_ = strm
744 else
745 strm_ = glb_cmd_queue
746 end if
747
748#if HAVE_HIP
749 call hip_invcol1(a_d, n, strm_)
750#elif HAVE_CUDA
751 call cuda_invcol1(a_d, n, strm_)
752#elif HAVE_OPENCL
753 call opencl_invcol1(a_d, n, strm_)
754#else
755 call neko_error('No device backend configured')
756#endif
757 end subroutine device_invcol1
758
760 subroutine device_invcol2(a_d, b_d, n, strm)
761 type(c_ptr) :: a_d, b_d
762 integer :: n
763 type(c_ptr), optional :: strm
764 type(c_ptr) :: strm_
765
766 if (n .lt. 1) return
767
768 if (present(strm)) then
769 strm_ = strm
770 else
771 strm_ = glb_cmd_queue
772 end if
773
774#if HAVE_HIP
775 call hip_invcol2(a_d, b_d, n, strm_)
776#elif HAVE_CUDA
777 call cuda_invcol2(a_d, b_d, n, strm_)
778#elif HAVE_OPENCL
779 call opencl_invcol2(a_d, b_d, n, strm_)
780#else
781 call neko_error('No device backend configured')
782#endif
783 end subroutine device_invcol2
784
786 subroutine device_invcol3(a_d, b_d, c_d, n, strm)
787 type(c_ptr) :: a_d, b_d, c_d
788 integer :: n
789 type(c_ptr), optional :: strm
790 type(c_ptr) :: strm_
791
792 if (present(strm)) then
793 strm_ = strm
794 else
795 strm_ = glb_cmd_queue
796 end if
797
798#ifdef HAVE_HIP
799 call hip_invcol3(a_d, b_d, c_d, n, strm_)
800#elif HAVE_CUDA
801 call cuda_invcol3(a_d, b_d, c_d, n, strm_)
802#elif HAVE_OPENCL
803 ! call opencl_invcol3(a_d, b_d, c_d, n)
804 call neko_error('opencl_invcol3 not implemented')
805#else
806 call neko_error('No device backend configured')
807#endif
808 end subroutine device_invcol3
809
811 subroutine device_col2(a_d, b_d, n, strm)
812 type(c_ptr) :: a_d, b_d
813 integer :: n
814 type(c_ptr), optional :: strm
815 type(c_ptr) :: strm_
816
817 if (present(strm)) then
818 strm_ = strm
819 else
820 strm_ = glb_cmd_queue
821 end if
822
823 if (n .lt. 1) return
824#if HAVE_HIP
825 call hip_col2(a_d, b_d, n, strm_)
826#elif HAVE_CUDA
827 call cuda_col2(a_d, b_d, n, strm_)
828#elif HAVE_OPENCL
829 call opencl_col2(a_d, b_d, n, strm_)
830#else
831 call neko_error('No device backend configured')
832#endif
833 end subroutine device_col2
834
836 subroutine device_col3(a_d, b_d, c_d, n, strm)
837 type(c_ptr) :: a_d, b_d, c_d
838 integer :: n
839 type(c_ptr), optional :: strm
840 type(c_ptr) :: strm_
841
842 if (n .lt. 1) return
843
844 if (present(strm)) then
845 strm_ = strm
846 else
847 strm_ = glb_cmd_queue
848 end if
849
850#if HAVE_HIP
851 call hip_col3(a_d, b_d, c_d, n, strm_)
852#elif HAVE_CUDA
853 call cuda_col3(a_d, b_d, c_d, n, strm_)
854#elif HAVE_OPENCL
855 call opencl_col3(a_d, b_d, c_d, n, strm_)
856#else
857 call neko_error('No device backend configured')
858#endif
859 end subroutine device_col3
860
862 subroutine device_subcol3(a_d, b_d, c_d, n, strm)
863 type(c_ptr) :: a_d, b_d, c_d
864 integer :: n
865 type(c_ptr), optional :: strm
866 type(c_ptr) :: strm_
867
868 if (n .lt. 1) return
869
870 if (present(strm)) then
871 strm_ = strm
872 else
873 strm_ = glb_cmd_queue
874 end if
875
876#if HAVE_HIP
877 call hip_subcol3(a_d, b_d, c_d, n, strm_)
878#elif HAVE_CUDA
879 call cuda_subcol3(a_d, b_d, c_d, n, strm_)
880#elif HAVE_OPENCL
881 call opencl_subcol3(a_d, b_d, c_d, n, strm_)
882#else
883 call neko_error('No device backend configured')
884#endif
885 end subroutine device_subcol3
886
888 subroutine device_sub2(a_d, b_d, n, strm)
889 type(c_ptr) :: a_d, b_d
890 integer :: n
891 type(c_ptr), optional :: strm
892 type(c_ptr) :: strm_
893
894 if (n .lt. 1) return
895
896 if (present(strm)) then
897 strm_ = strm
898 else
899 strm_ = glb_cmd_queue
900 end if
901
902#if HAVE_HIP
903 call hip_sub2(a_d, b_d, n, strm_)
904#elif HAVE_CUDA
905 call cuda_sub2(a_d, b_d, n, strm_)
906#elif HAVE_OPENCL
907 call opencl_sub2(a_d, b_d, n, strm_)
908#else
909 call neko_error('No device backend configured')
910#endif
911 end subroutine device_sub2
912
914 subroutine device_sub3(a_d, b_d, c_d, n, strm)
915 type(c_ptr) :: a_d, b_d, c_d
916 integer :: n
917 type(c_ptr), optional :: strm
918 type(c_ptr) :: strm_
919
920 if (n .lt. 1) return
921
922 if (present(strm)) then
923 strm_ = strm
924 else
925 strm_ = glb_cmd_queue
926 end if
927
928#if HAVE_HIP
929 call hip_sub3(a_d, b_d, c_d, n, strm_)
930#elif HAVE_CUDA
931 call cuda_sub3(a_d, b_d, c_d, n, strm_)
932#elif HAVE_OPENCL
933 call opencl_sub3(a_d, b_d, c_d, n, strm_)
934#else
935 call neko_error('No device backend configured')
936#endif
937 end subroutine device_sub3
938
940 subroutine device_addcol3(a_d, b_d, c_d, n, strm)
941 type(c_ptr) :: a_d, b_d, c_d
942 integer :: n
943 type(c_ptr), optional :: strm
944 type(c_ptr) :: strm_
945
946 if (n .lt. 1) return
947
948 if (present(strm)) then
949 strm_ = strm
950 else
951 strm_ = glb_cmd_queue
952 end if
953
954#if HAVE_HIP
955 call hip_addcol3(a_d, b_d, c_d, n, strm_)
956#elif HAVE_CUDA
957 call cuda_addcol3(a_d, b_d, c_d, n, strm_)
958#elif HAVE_OPENCL
959 call opencl_addcol3(a_d, b_d, c_d, n, strm_)
960#else
961 call neko_error('No device backend configured')
962#endif
963 end subroutine device_addcol3
964
966 subroutine device_addcol4(a_d, b_d, c_d, d_d, n, strm)
967 type(c_ptr) :: a_d, b_d, c_d, d_d
968 integer :: n
969 type(c_ptr), optional :: strm
970 type(c_ptr) :: strm_
971
972 if (n .lt. 1) return
973
974 if (present(strm)) then
975 strm_ = strm
976 else
977 strm_ = glb_cmd_queue
978 end if
979
980#if HAVE_HIP
981 call hip_addcol4(a_d, b_d, c_d, d_d, n, strm_)
982#elif HAVE_CUDA
983 call cuda_addcol4(a_d, b_d, c_d, d_d, n, strm_)
984#elif HAVE_OPENCL
985 call opencl_addcol4(a_d, b_d, c_d, d_d, n, strm_)
986#else
987 call neko_error('No device backend configured')
988#endif
989 end subroutine device_addcol4
990
992 subroutine device_addcol3s2(a_d, b_d, c_d, s, n, strm)
993 type(c_ptr) :: a_d, b_d, c_d
994 real(kind=rp) :: s
995 integer :: n
996 type(c_ptr), optional :: strm
997 type(c_ptr) :: strm_
998
999 if (n .lt. 1) return
1000
1001 if (present(strm)) then
1002 strm_ = strm
1003 else
1004 strm_ = glb_cmd_queue
1005 end if
1006
1007#if HAVE_HIP
1008 call hip_addcol3s2(a_d, b_d, c_d, s, n, strm_)
1009#elif HAVE_CUDA
1010 call cuda_addcol3s2(a_d, b_d, c_d, s, n, strm_)
1011#elif HAVE_OPENCL
1012 call opencl_addcol3s2(a_d, b_d, c_d, s, n, strm_)
1013#else
1014 call neko_error('No device backend configured')
1015#endif
1016 end subroutine device_addcol3s2
1017
1020 subroutine device_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm)
1021 type(c_ptr) :: dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d
1022 integer :: n
1023 type(c_ptr), optional :: strm
1024 type(c_ptr) :: strm_
1025
1026 if (n .lt. 1) return
1027
1028 if (present(strm)) then
1029 strm_ = strm
1030 else
1031 strm_ = glb_cmd_queue
1032 end if
1033
1034#if HAVE_HIP
1035 call hip_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1036#elif HAVE_CUDA
1037 call cuda_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1038#elif HAVE_OPENCL
1039 call opencl_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1040#else
1041 call neko_error('No device backend configured')
1042#endif
1043 end subroutine device_vdot3
1044
1047 subroutine device_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1048 w1_d, w2_d, w3_d, n, strm)
1049 type(c_ptr) :: u1_d, u2_d, u3_d
1050 type(c_ptr) :: v1_d, v2_d, v3_d
1051 type(c_ptr) :: w1_d, w2_d, w3_d
1052 integer :: n
1053 type(c_ptr), optional :: strm
1054 type(c_ptr) :: strm_
1055
1056 if (n .lt. 1) return
1057
1058 if (present(strm)) then
1059 strm_ = strm
1060 else
1061 strm_ = glb_cmd_queue
1062 end if
1063
1064#if HAVE_HIP
1065 call hip_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1066 w1_d, w2_d, w3_d, n, strm_)
1067#elif HAVE_CUDA
1068 call cuda_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1069 w1_d, w2_d, w3_d, n, strm_)
1070#elif HAVE_OPENCL
1071 call opencl_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1072 w1_d, w2_d, w3_d, n, strm_)
1073#else
1074 call neko_error('No device backend configured')
1075#endif
1076 end subroutine device_vcross
1077
1078
1080 function device_vlsc3(u_d, v_d, w_d, n, strm) result(res)
1081 type(c_ptr) :: u_d, v_d, w_d
1082 integer :: n
1083 type(c_ptr), optional :: strm
1084 type(c_ptr) :: strm_
1085 real(kind=rp) :: res
1086
1087 res = 0.0_rp
1088
1089 if (n .lt. 1) return
1090
1091 if (present(strm)) then
1092 strm_ = strm
1093 else
1094 strm_ = glb_cmd_queue
1095 end if
1096
1097#if HAVE_HIP
1098 res = hip_vlsc3(u_d, v_d, w_d, n, strm_)
1099#elif HAVE_CUDA
1100 res = cuda_vlsc3(u_d, v_d, w_d, n, strm_)
1101#elif HAVE_OPENCL
1102 ! Same kernel as glsc3 (currently no device MPI for OpenCL)
1103 res = opencl_glsc3(u_d, v_d, w_d, n, strm_)
1104#else
1105 call neko_error('No device backend configured')
1106#endif
1107 end function device_vlsc3
1108
1110 function device_glsc3(a_d, b_d, c_d, n, strm) result(res)
1111 type(c_ptr) :: a_d, b_d, c_d
1112 integer :: n, ierr
1113 type(c_ptr), optional :: strm
1114 type(c_ptr) :: strm_
1115 real(kind=rp) :: res
1116
1117 if (present(strm)) then
1118 strm_ = strm
1119 else
1120 strm_ = glb_cmd_queue
1121 end if
1122
1123 res = 0.0_rp
1124#if HAVE_HIP
1125 res = hip_glsc3(a_d, b_d, c_d, n, strm_)
1126#elif HAVE_CUDA
1127 res = cuda_glsc3(a_d, b_d, c_d, n, strm_)
1128#elif HAVE_OPENCL
1129 res = opencl_glsc3(a_d, b_d, c_d, n, strm_)
1130#else
1131 call neko_error('No device backend configured')
1132#endif
1133
1134#ifndef HAVE_DEVICE_MPI
1135 if (pe_size .gt. 1) then
1136 call mpi_allreduce(mpi_in_place, res, 1, &
1137 mpi_real_precision, mpi_sum, neko_comm, ierr)
1138 end if
1139#endif
1140 end function device_glsc3
1141
1142 subroutine device_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm)
1143 type(c_ptr), value :: w_d, v_d_d, mult_d
1144 integer(c_int) :: j, n
1145 real(c_rp) :: h(j)
1146 type(c_ptr), optional :: strm
1147 type(c_ptr) :: strm_
1148 integer :: ierr
1149
1150 if (present(strm)) then
1151 strm_ = strm
1152 else
1153 strm_ = glb_cmd_queue
1154 end if
1155
1156#if HAVE_HIP
1157 call hip_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1158#elif HAVE_CUDA
1159 call cuda_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1160#elif HAVE_OPENCL
1161 call opencl_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1162#else
1163 call neko_error('No device backend configured')
1164#endif
1165
1166#ifndef HAVE_DEVICE_MPI
1167 if (pe_size .gt. 1) then
1168 call mpi_allreduce(mpi_in_place, h, j, &
1169 mpi_real_precision, mpi_sum, neko_comm, ierr)
1170 end if
1171#endif
1172 end subroutine device_glsc3_many
1173
1174 subroutine device_add2s2_many(y_d, x_d_d, a_d, j, n, strm)
1175 type(c_ptr), value :: y_d, x_d_d, a_d
1176 integer(c_int) :: j, n
1177 type(c_ptr), optional :: strm
1178 type(c_ptr) :: strm_
1179
1180 if (n .lt. 1) return
1181
1182 if (present(strm)) then
1183 strm_ = strm
1184 else
1185 strm_ = glb_cmd_queue
1186 end if
1187
1188#if HAVE_HIP
1189 call hip_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1190#elif HAVE_CUDA
1191 call cuda_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1192#elif HAVE_OPENCL
1193 call opencl_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1194#else
1195 call neko_error('No device backend configured')
1196#endif
1197 end subroutine device_add2s2_many
1198
1200 function device_glsc2(a_d, b_d, n, strm) result(res)
1201 type(c_ptr) :: a_d, b_d
1202 integer :: n, ierr
1203 real(kind=rp) :: res
1204 type(c_ptr), optional :: strm
1205 type(c_ptr) :: strm_
1206
1207 if (present(strm)) then
1208 strm_ = strm
1209 else
1210 strm_ = glb_cmd_queue
1211 end if
1212
1213 res = 0.0_rp
1214#if HAVE_HIP
1215 res = hip_glsc2(a_d, b_d, n, strm_)
1216#elif HAVE_CUDA
1217 res = cuda_glsc2(a_d, b_d, n, strm_)
1218#elif HAVE_OPENCL
1219 res = opencl_glsc2(a_d, b_d, n, strm_)
1220#else
1221 call neko_error('No device backend configured')
1222#endif
1223
1224#ifndef HAVE_DEVICE_MPI
1225 if (pe_size .gt. 1) then
1226 call mpi_allreduce(mpi_in_place, res, 1, &
1227 mpi_real_precision, mpi_sum, neko_comm, ierr)
1228 end if
1229#endif
1230 end function device_glsc2
1231
1234 function device_glsubnorm(a_d, b_d, n, strm) result(res)
1235 type(c_ptr), intent(in) :: a_d, b_d
1236 integer, intent(in) :: n
1237 integer :: ierr
1238 real(kind=rp) :: res
1239 type(c_ptr), optional :: strm
1240 type(c_ptr) :: strm_
1241
1242 if (present(strm)) then
1243 strm_ = strm
1244 else
1245 strm_ = glb_cmd_queue
1246 end if
1247
1248 res = 0.0_rp
1249#if HAVE_HIP
1250 res = hip_glsubnorm2(a_d, b_d, n, strm_)
1251#elif HAVE_CUDA
1252 res = cuda_glsubnorm2(a_d, b_d, n, strm_)
1253#elif HAVE_OPENCL
1254 res = opencl_glsubnorm2(a_d, b_d, n, strm_)
1255#else
1256 call neko_error('No device backend configured')
1257#endif
1258
1259#ifndef HAVE_DEVICE_MPI
1260 if (pe_size .gt. 1) then
1261 call mpi_allreduce(mpi_in_place, res, 1, &
1262 mpi_real_precision, mpi_sum, neko_comm, ierr)
1263 end if
1264#endif
1265
1266 res = sqrt(res)
1267 end function device_glsubnorm
1268
1270 function device_glsum(a_d, n, strm) result(res)
1271 type(c_ptr) :: a_d
1272 integer :: n, ierr
1273 real(kind=rp) :: res
1274 type(c_ptr), optional :: strm
1275 type(c_ptr) :: strm_
1276
1277 if (present(strm)) then
1278 strm_ = strm
1279 else
1280 strm_ = glb_cmd_queue
1281 end if
1282
1283 res = 0.0_rp
1284#if HAVE_HIP
1285 res = hip_glsum(a_d, n, strm_)
1286#elif HAVE_CUDA
1287 res = cuda_glsum(a_d, n, strm_)
1288#elif HAVE_OPENCL
1289 res = opencl_glsum(a_d, n, strm_)
1290#else
1291 call neko_error('No device backend configured')
1292#endif
1293
1294#ifndef HAVE_DEVICE_MPI
1295 if (pe_size .gt. 1) then
1296 call mpi_allreduce(mpi_in_place, res, 1, &
1297 mpi_real_precision, mpi_sum, neko_comm, ierr)
1298 end if
1299#endif
1300 end function device_glsum
1301
  !> Max of a device vector of length n, reduced over all MPI ranks.
  !! @param a_d Device pointer to the vector.
  !! @param n Number of entries.
  !! @param strm Device stream/command queue (defaults to the global queue).
  function device_glmax(a_d, n, strm) result(res)
    type(c_ptr) :: a_d
    integer :: n, ierr
    real(kind=rp) :: res, ninf
    type(c_ptr), optional :: strm
    type(c_ptr) :: strm_

    ! Empty vector: return the identity element of max.
    ! NOTE(review): this early return also skips the MPI reduction below;
    ! presumably n >= 1 on every rank whenever pe_size > 1 — confirm.
    if (n .lt. 1) then
       res = -huge(0.0_rp)
       return
    end if

    ! Fall back to the global command queue when no stream is given
    if (present(strm)) then
       strm_ = strm
    else
       strm_ = glb_cmd_queue
    end if

    ! Seed value for the reduction: most negative representable number
    ninf = -huge(0.0_rp)
#if HAVE_HIP
    res = hip_glmax(a_d, ninf, n, strm_)
#elif HAVE_CUDA
    res = cuda_glmax(a_d, ninf, n, strm_)
#elif HAVE_OPENCL
    ! NOTE(review): the OpenCL kernel does not take the seed value — confirm
    ! it seeds the reduction internally
    res = opencl_glmax(a_d, n, strm_)
#else
    call neko_error('No device backend configured')
#endif

    ! Host-side global reduction unless device-aware MPI handles it
#ifndef HAVE_DEVICE_MPI
    if (pe_size .gt. 1) then
       call mpi_allreduce(mpi_in_place, res, 1, &
            mpi_real_precision, mpi_max, neko_comm, ierr)
    end if
#endif
  end function device_glmax
1339
1341 function device_glmin(a_d, n, strm) result(res)
1342 type(c_ptr) :: a_d
1343 integer :: n, ierr
1344 real(kind=rp) :: res, pinf
1345 type(c_ptr), optional :: strm
1346 type(c_ptr) :: strm_
1347
1348 if (n .lt. 1) then
1349 res = huge(0.0_rp)
1350 return
1351 end if
1352
1353 if (present(strm)) then
1354 strm_ = strm
1355 else
1356 strm_ = glb_cmd_queue
1357 end if
1358
1359 pinf = huge(0.0_rp)
1360#if HAVE_HIP
1361 res = hip_glmin(a_d, pinf, n, strm_)
1362#elif HAVE_CUDA
1363 res = cuda_glmin(a_d, pinf, n, strm_)
1364#elif HAVE_OPENCL
1365 res = opencl_glmin(a_d, n, strm_)
1366#else
1367 call neko_error('No device backend configured')
1368#endif
1369
1370#ifndef HAVE_DEVICE_MPI
1371 if (pe_size .gt. 1) then
1372 call mpi_allreduce(mpi_in_place, res, 1, &
1373 mpi_real_precision, mpi_min, neko_comm, ierr)
1374 end if
1375#endif
1376 end function device_glmin
1377
1378 subroutine device_absval(a_d, n, strm)
1379 integer, intent(in) :: n
1380 type(c_ptr) :: a_d
1381 type(c_ptr), optional :: strm
1382 type(c_ptr) :: strm_
1383
1384 if (n .lt. 1) return
1385
1386 if (present(strm)) then
1387 strm_ = strm
1388 else
1389 strm_ = glb_cmd_queue
1390 end if
1391
1392#ifdef HAVE_HIP
1393 call hip_absval(a_d, n, strm_)
1394#elif HAVE_CUDA
1395 call cuda_absval(a_d, n, strm_)
1396#elif HAVE_OPENCL
1397 call opencl_absval(a_d, n, strm_)
1398#else
1399 call neko_error('No device backend configured')
1400#endif
1401
1402 end subroutine device_absval
1403
1404 ! ========================================================================== !
1405 ! Device point-wise max
1406
1409 subroutine device_pwmax2(a_d, b_d, n, strm)
1410 type(c_ptr) :: a_d, b_d
1411 integer :: n
1412 type(c_ptr), optional :: strm
1413 type(c_ptr) :: strm_
1414
1415 if (n .lt. 1) return
1416
1417 if (present(strm)) then
1418 strm_ = strm
1419 else
1420 strm_ = glb_cmd_queue
1421 end if
1422
1423#if HAVE_HIP
1424 call hip_pwmax_vec2(a_d, b_d, n, strm_)
1425#elif HAVE_CUDA
1426 call cuda_pwmax_vec2(a_d, b_d, n, strm_)
1427#elif HAVE_OPENCL
1428 call opencl_pwmax_vec2(a_d, b_d, n, strm_)
1429#else
1430 call neko_error('No device backend configured')
1431#endif
1432 end subroutine device_pwmax2
1433
1436 subroutine device_pwmax3(a_d, b_d, c_d, n, strm)
1437 type(c_ptr) :: a_d, b_d, c_d
1438 integer :: n
1439 type(c_ptr), optional :: strm
1440 type(c_ptr) :: strm_
1441
1442 if (n .lt. 1) return
1443
1444 if (present(strm)) then
1445 strm_ = strm
1446 else
1447 strm_ = glb_cmd_queue
1448 end if
1449
1450#if HAVE_HIP
1451 call hip_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1452#elif HAVE_CUDA
1453 call cuda_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1454#elif HAVE_OPENCL
1455 call opencl_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1456#else
1457 call neko_error('No device backend configured')
1458#endif
1459
1460 end subroutine device_pwmax3
1461
  !> Compute the point-wise maximum of a device vector and a scalar.
  !! @param a_d Device pointer to the vector.
  !! @param c Scalar to compare against each entry.
  !! @param n Number of entries.
  !! @param strm Device stream/command queue (defaults to the global queue).
  subroutine device_cpwmax2(a_d, c, n, strm)
    type(c_ptr) :: a_d
    real(kind=rp), intent(in) :: c
    integer :: n
    type(c_ptr), optional :: strm
    type(c_ptr) :: strm_

    ! Nothing to do for empty vectors
    if (n .lt. 1) return

    ! Fall back to the global command queue when no stream is given
    if (present(strm)) then
       strm_ = strm
    else
       strm_ = glb_cmd_queue
    end if

#if HAVE_HIP
    call hip_pwmax_sca2(a_d, c, n, strm_)
#elif HAVE_CUDA
    call cuda_pwmax_sca2(a_d, c, n, strm_)
#elif HAVE_OPENCL
    call opencl_pwmax_sca2(a_d, c, n, strm_)
#else
    call neko_error('No device backend configured')
#endif

  end subroutine device_cpwmax2
1490
1493 subroutine device_cpwmax3(a_d, b_d, c, n, strm)
1494 type(c_ptr) :: a_d, b_d
1495 real(kind=rp), intent(in) :: c
1496 integer :: n
1497 type(c_ptr), optional :: strm
1498 type(c_ptr) :: strm_
1499
1500 if (n .lt. 1) return
1501
1502 if (present(strm)) then
1503 strm_ = strm
1504 else
1505 strm_ = glb_cmd_queue
1506 end if
1507
1508#if HAVE_HIP
1509 call hip_pwmax_sca3(a_d, b_d, c, n, strm_)
1510#elif HAVE_CUDA
1511 call cuda_pwmax_sca3(a_d, b_d, c, n, strm_)
1512#elif HAVE_OPENCL
1513 call opencl_pwmax_sca3(a_d, b_d, c, n, strm_)
1514#else
1515 call neko_error('No device backend configured')
1516#endif
1517
1518 end subroutine device_cpwmax3
1519
1520 ! ========================================================================== !
1521 ! Device point-wise min
1522
1525 subroutine device_pwmin2(a_d, b_d, n, strm)
1526 type(c_ptr) :: a_d, b_d
1527 integer :: n
1528 type(c_ptr), optional :: strm
1529 type(c_ptr) :: strm_
1530
1531 if (n .lt. 1) return
1532
1533 if (present(strm)) then
1534 strm_ = strm
1535 else
1536 strm_ = glb_cmd_queue
1537 end if
1538
1539#if HAVE_HIP
1540 call hip_pwmin_vec2(a_d, b_d, n, strm_)
1541#elif HAVE_CUDA
1542 call cuda_pwmin_vec2(a_d, b_d, n, strm_)
1543#elif HAVE_OPENCL
1544 call opencl_pwmin_vec2(a_d, b_d, n, strm_)
1545#else
1546 call neko_error('No device backend configured')
1547#endif
1548 end subroutine device_pwmin2
1549
  !> Compute the point-wise minimum of two device vectors.
  !! @param a_d Device pointer to the destination vector.
  !! @param b_d Device pointer to the first source vector.
  !! @param c_d Device pointer to the second source vector.
  !! @param n Number of entries.
  !! @param strm Device stream/command queue (defaults to the global queue).
  subroutine device_pwmin3(a_d, b_d, c_d, n, strm)
    type(c_ptr) :: a_d, b_d, c_d
    integer :: n
    type(c_ptr), optional :: strm
    type(c_ptr) :: strm_

    ! Nothing to do for empty vectors
    if (n .lt. 1) return

    ! Fall back to the global command queue when no stream is given
    if (present(strm)) then
       strm_ = strm
    else
       strm_ = glb_cmd_queue
    end if

#if HAVE_HIP
    call hip_pwmin_vec3(a_d, b_d, c_d, n, strm_)
#elif HAVE_CUDA
    call cuda_pwmin_vec3(a_d, b_d, c_d, n, strm_)
#elif HAVE_OPENCL
    call opencl_pwmin_vec3(a_d, b_d, c_d, n, strm_)
#else
    call neko_error('No device backend configured')
#endif

  end subroutine device_pwmin3
1577
1580 subroutine device_cpwmin2(a_d, c, n, strm)
1581 type(c_ptr) :: a_d
1582 real(kind=rp), intent(in) :: c
1583 integer :: n
1584 type(c_ptr), optional :: strm
1585 type(c_ptr) :: strm_
1586
1587 if (n .lt. 1) return
1588
1589 if (present(strm)) then
1590 strm_ = strm
1591 else
1592 strm_ = glb_cmd_queue
1593 end if
1594
1595#if HAVE_HIP
1596 call hip_pwmin_sca2(a_d, c, n, strm_)
1597#elif HAVE_CUDA
1598 call cuda_pwmin_sca2(a_d, c, n, strm_)
1599#elif HAVE_OPENCL
1600 call opencl_pwmin_sca2(a_d, c, n, strm_)
1601#else
1602 call neko_error('No device backend configured')
1603#endif
1604
1605 end subroutine device_cpwmin2
1606
  !> Compute the point-wise minimum of a device vector and a scalar.
  !! @param a_d Device pointer to the destination vector.
  !! @param b_d Device pointer to the source vector.
  !! @param c Scalar to compare against each entry.
  !! @param n Number of entries.
  !! @param strm Device stream/command queue (defaults to the global queue).
  subroutine device_cpwmin3(a_d, b_d, c, n, strm)
    type(c_ptr) :: a_d, b_d
    real(kind=rp), intent(in) :: c
    integer :: n
    type(c_ptr), optional :: strm
    type(c_ptr) :: strm_

    ! Nothing to do for empty vectors
    if (n .lt. 1) return

    ! Fall back to the global command queue when no stream is given
    if (present(strm)) then
       strm_ = strm
    else
       strm_ = glb_cmd_queue
    end if

#if HAVE_HIP
    call hip_pwmin_sca3(a_d, b_d, c, n, strm_)
#elif HAVE_CUDA
    call cuda_pwmin_sca3(a_d, b_d, c, n, strm_)
#elif HAVE_OPENCL
    call opencl_pwmin_sca3(a_d, b_d, c, n, strm_)
#else
    call neko_error('No device backend configured')
#endif

  end subroutine device_cpwmin3
1635
1636 ! ========================================================================== !
1637 ! Integer operations
1638
1640 subroutine device_iadd(a_d, c, n, strm)
1641 type(c_ptr), intent(inout) :: a_d
1642 integer, intent(in) :: c
1643 integer, intent(in) :: n
1644 type(c_ptr), optional :: strm
1645 type(c_ptr) :: strm_
1646 if (n .lt. 1) return
1647
1648 if (present(strm)) then
1649 strm_ = strm
1650 else
1651 strm_ = glb_cmd_queue
1652 end if
1653
1654#if HAVE_HIP
1655 call hip_iadd(a_d, c, n, strm_)
1656#elif HAVE_CUDA
1657 call cuda_iadd(a_d, c, n, strm_)
1658#elif HAVE_OPENCL
1659 call opencl_iadd(a_d, c, n, strm_)
1660#else
1661 call neko_error('No device backend configured')
1662#endif
1663 end subroutine device_iadd
1664
1665
1666end module device_math
Definition comm.F90:1
type(mpi_datatype), public mpi_real_precision
MPI type for working precision of REAL types.
Definition comm.F90:51
integer, public pe_size
MPI size of communicator.
Definition comm.F90:59
type(mpi_comm), public neko_comm
MPI communicator.
Definition comm.F90:43
subroutine, public device_pwmin2(a_d, b_d, n, strm)
Compute the point-wise minimum of two vectors .
subroutine, public device_add2s1(a_d, b_d, c1, n, strm)
subroutine, public device_add2s2_many(y_d, x_d_d, a_d, j, n, strm)
subroutine, public device_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm)
Returns .
subroutine, public device_sub3(a_d, b_d, c_d, n, strm)
Vector subtraction .
subroutine, public device_masked_scatter_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Scatter a masked vector .
subroutine, public device_add2s2(a_d, b_d, c1, n, strm)
Vector addition with scalar multiplication (multiplication on first argument)
real(kind=rp) function, public device_glmax(a_d, n, strm)
Max of a vector of length n.
subroutine, public device_add2(a_d, b_d, n, strm)
Vector addition .
subroutine, public device_addcol3(a_d, b_d, c_d, n, strm)
Returns .
real(kind=rp) function, public device_glsum(a_d, n, strm)
Sum a vector of length n.
subroutine, public device_pwmax3(a_d, b_d, c_d, n, strm)
Compute the point-wise maximum of two vectors .
subroutine, public device_invcol1(a_d, n, strm)
Invert a vector .
subroutine, public device_add3s2(a_d, b_d, c_d, c1, c2, n, strm)
Returns .
subroutine, public device_masked_atomic_reduction_0(a_d, b_d, mask_d, n, n_mask, strm)
subroutine, public device_cpwmax3(a_d, b_d, c, n, strm)
Compute the point-wise maximum of a vector and a scalar .
subroutine, public device_rzero(a_d, n, strm)
Zero a real vector.
subroutine, public device_rone(a_d, n, strm)
Set all elements to one.
subroutine, public device_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, w1_d, w2_d, w3_d, n, strm)
Compute a cross product (3-d version) assuming vector components etc.
subroutine, public device_cmult(a_d, c, n, strm)
Multiplication by constant c .
subroutine, public device_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm)
Compute a dot product (3-d version) assuming vector components etc.
real(kind=rp) function, public device_glsubnorm(a_d, b_d, n, strm)
Returns the norm of the difference of two vectors .
subroutine, public device_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm)
subroutine, public device_cfill_mask(a_d, c, n, mask_d, n_mask, strm)
Fill a constant to a masked vector. .
subroutine, public device_cadd2(a_d, b_d, c, n, strm)
Add a scalar to vector .
subroutine, public device_sub2(a_d, b_d, n, strm)
Vector substraction .
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
subroutine, public device_invcol3(a_d, b_d, c_d, n, strm)
Vector division .
subroutine, public device_pwmin3(a_d, b_d, c_d, n, strm)
Compute the point-wise minimum of two vectors .
subroutine, public device_col2(a_d, b_d, n, strm)
Vector multiplication .
subroutine, public device_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
Gather a masked vector .
real(kind=rp) function, public device_vlsc3(u_d, v_d, w_d, n, strm)
Compute multiplication sum .
subroutine, public device_cdiv2(a_d, b_d, c, n, strm)
Division of constant c by array .
subroutine, public device_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm)
Returns .
subroutine, public device_add4(a_d, b_d, c_d, d_d, n, strm)
subroutine, public device_cpwmax2(a_d, c, n, strm)
Compute the point-wise maximum of a vector and a scalar .
subroutine, public device_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Copy a masked vector .
subroutine, public device_subcol3(a_d, b_d, c_d, n, strm)
Returns .
subroutine, public device_cdiv(a_d, c, n, strm)
Division of constant c by array .
subroutine, public device_absval(a_d, n, strm)
subroutine device_iadd(a_d, c, n, strm)
Add an integer scalar to vector .
subroutine, public device_masked_gather_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Gather a masked vector .
subroutine, public device_invcol2(a_d, b_d, n, strm)
Vector division .
subroutine, public device_addsqr2s2(a_d, b_d, c1, n, strm)
Returns .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
subroutine, public device_cpwmin3(a_d, b_d, c, n, strm)
Compute the point-wise minimum of a vector and a scalar .
real(kind=rp) function, public device_glsc2(a_d, b_d, n, strm)
Weighted inner product .
subroutine, public device_cmult2(a_d, b_d, c, n, strm)
Multiplication by constant c .
subroutine, public device_col3(a_d, b_d, c_d, n, strm)
Vector multiplication with 3 vectors .
subroutine, public device_addcol4(a_d, b_d, c_d, d_d, n, strm)
Returns .
subroutine, public device_cfill(a_d, c, n, strm)
Set all elements to a constant c .
subroutine, public device_cpwmin2(a_d, c, n, strm)
Compute the point-wise minimum of a vector and a scalar .
subroutine, public device_add3(a_d, b_d, c_d, n, strm)
Vector addition .
real(kind=rp) function, public device_glmin(a_d, n, strm)
Min of a vector of length n.
subroutine, public device_addcol3s2(a_d, b_d, c_d, s, n, strm)
Returns .
subroutine, public device_pwmax2(a_d, b_d, n, strm)
Compute the point-wise maximum of two vectors .
subroutine device_radd(a_d, c, n, strm)
Add a scalar to vector .
Device abstraction, common interface for various accelerators.
Definition device.F90:34
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
Definition device.F90:51
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Utilities.
Definition utils.f90:35