Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
device_math.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2025, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
34 use, intrinsic :: iso_c_binding, only: c_ptr, c_int
35 use num_types, only : rp, c_rp
36 use utils, only : neko_error
38 use mpi_f08, only : mpi_sum, mpi_min, mpi_max, mpi_in_place, mpi_allreduce
39 use device, only : glb_cmd_queue
40 ! ========================================================================== !
41 ! Device math interfaces
42
43 use hip_math
44 use cuda_math
45 use opencl_math
46 use metal_math
47
48 implicit none
49 private
50
51 interface device_cadd
52 module procedure device_radd, device_iadd
53 end interface device_cadd
54
72
73contains
74
76 subroutine device_copy(a_d, b_d, n, strm)
77 type(c_ptr) :: a_d, b_d
78 integer :: n
79 type(c_ptr), optional :: strm
80 type(c_ptr) :: strm_
81
82 if (n .lt. 1) return
83
84 if (present(strm)) then
85 strm_ = strm
86 else
87 strm_ = glb_cmd_queue
88 end if
89
90#if HAVE_HIP
91 call hip_copy(a_d, b_d, n, strm_)
92#elif HAVE_CUDA
93 call cuda_copy(a_d, b_d, n, strm_)
94#elif HAVE_OPENCL
95 call opencl_copy(a_d, b_d, n, strm_)
96#elif HAVE_METAL
97 call metal_copy(a_d, b_d, n, strm_)
98#else
99 call neko_error('no device backend configured')
100#endif
101 end subroutine device_copy
102
105 subroutine device_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
106 type(c_ptr) :: a_d, b_d, mask_d
107 integer :: n, n_mask
108 type(c_ptr), optional :: strm
109 type(c_ptr) :: strm_
110
111 if (n .lt. 1 .or. n_mask .lt. 1) return
112
113 if (present(strm)) then
114 strm_ = strm
115 else
116 strm_ = glb_cmd_queue
117 end if
118
119#if HAVE_HIP
120 call hip_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm_)
121#elif HAVE_CUDA
122 call cuda_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm_)
123#elif HAVE_OPENCL
124 call opencl_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm_)
125#elif HAVE_METAL
126 call metal_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm_)
127#else
128 call neko_error('no device backend configured')
129#endif
130 end subroutine device_masked_copy_0
131
134 subroutine device_masked_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
135 type(c_ptr) :: a_d, b_d, mask_d
136 integer :: n, n_mask
137 type(c_ptr), optional :: strm
138 type(c_ptr) :: strm_
139
140 if (n .lt. 1 .or. n_mask .lt. 1) return
141
142 if (present(strm)) then
143 strm_ = strm
144 else
145 strm_ = glb_cmd_queue
146 end if
147
148#if HAVE_HIP
149 call hip_masked_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
150#elif HAVE_CUDA
151 call cuda_masked_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
152#elif HAVE_OPENCL
153 call opencl_masked_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
154#else
155 call neko_error('no device backend configured')
156#endif
157 end subroutine device_masked_copy_aligned
158
160 subroutine device_masked_gather_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
161 type(c_ptr) :: a_d, b_d, mask_d
162 integer :: n, n_mask
163 type(c_ptr), optional :: strm
164 type(c_ptr) :: strm_
165
166 if (n .lt. 1 .or. n_mask .lt. 1) return
167
168 if (present(strm)) then
169 strm_ = strm
170 else
171 strm_ = glb_cmd_queue
172 end if
173
174#if HAVE_HIP
175 call hip_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
176#elif HAVE_CUDA
177 call cuda_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
178#elif HAVE_OPENCL
179 call opencl_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
180#elif HAVE_METAL
181 call metal_masked_gather_copy(a_d, b_d, mask_d, n, n_mask, strm_)
182#else
183 call neko_error('no device backend configured')
184#endif
185 end subroutine device_masked_gather_copy_0
186
188 subroutine device_face_masked_gather_copy_0(a_d, b_d, mask_d, facet_d, n1, &
189 n2, lx, ly, lz, n_mask, strm)
190 type(c_ptr) :: a_d, b_d, mask_d, facet_d
191 integer :: n1, n2, lx, ly, lz, n_mask
192 type(c_ptr), optional :: strm
193 type(c_ptr) :: strm_
194
195 if (n_mask .lt. 1) return
196
197 if (present(strm)) then
198 strm_ = strm
199 else
200 strm_ = glb_cmd_queue
201 end if
202
203#if HAVE_HIP
204 call hip_face_masked_gather_copy(a_d, b_d, mask_d, facet_d, n1, n2, lx, &
205 ly, lz, n_mask, strm_)
206#elif HAVE_CUDA
207 call cuda_face_masked_gather_copy(a_d, b_d, mask_d, facet_d, n1, n2, lx, &
208 ly, lz, n_mask, strm_)
209#elif HAVE_OPENCL
210 call opencl_face_masked_gather_copy(a_d, b_d, mask_d, facet_d, n1, n2, &
211 lx, ly, lz, n_mask, strm_)
212#elif HAVE_METAL
213 call metal_face_masked_gather_copy(a_d, b_d, mask_d, facet_d, n1, n2, &
214 lx, ly, lz, n_mask, strm_)
215#else
216 call neko_error('no device backend configured')
217#endif
219
221 ! In this case, the mask comes from a mask_t type
222 subroutine device_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
223 type(c_ptr) :: a_d, b_d, mask_d
224 integer :: n, n_mask
225 type(c_ptr), optional :: strm
226 type(c_ptr) :: strm_
227
228 if (n .lt. 1 .or. n_mask .lt. 1) return
229
230 if (present(strm)) then
231 strm_ = strm
232 else
233 strm_ = glb_cmd_queue
234 end if
235
236#if HAVE_HIP
237 call hip_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
238#elif HAVE_CUDA
239 call cuda_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
240#elif HAVE_OPENCL
241 call opencl_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
242#elif HAVE_METAL
243 call metal_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
244#else
245 call neko_error('no device backend configured')
246#endif
248
250 subroutine device_masked_scatter_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
251 type(c_ptr) :: a_d, b_d, mask_d
252 integer :: n, n_mask
253 type(c_ptr), optional :: strm
254 type(c_ptr) :: strm_
255
256 if (n .lt. 1 .or. n_mask .lt. 1) return
257
258 if (present(strm)) then
259 strm_ = strm
260 else
261 strm_ = glb_cmd_queue
262 end if
263
264#if HAVE_HIP
265 call hip_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
266#elif HAVE_CUDA
267 call cuda_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
268#elif HAVE_OPENCL
269 call opencl_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
270#elif HAVE_METAL
271 call metal_masked_scatter_copy(a_d, b_d, mask_d, n, n_mask, strm_)
272#else
273 call neko_error('no device backend configured')
274#endif
275 end subroutine device_masked_scatter_copy_0
276
278 ! In this case, the mask comes from a mask_t type
279 subroutine device_masked_scatter_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
280 type(c_ptr) :: a_d, b_d, mask_d
281 integer :: n, n_mask
282 type(c_ptr), optional :: strm
283 type(c_ptr) :: strm_
284
285 if (n .lt. 1 .or. n_mask .lt. 1) return
286
287 if (present(strm)) then
288 strm_ = strm
289 else
290 strm_ = glb_cmd_queue
291 end if
292
293#if HAVE_HIP
294 call hip_masked_scatter_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
295#elif HAVE_CUDA
296 call cuda_masked_scatter_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
297#elif HAVE_OPENCL
298 call opencl_masked_scatter_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
299#elif HAVE_METAL
300 call metal_masked_scatter_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm_)
301#else
302 call neko_error('no device backend configured')
303#endif
305
306 subroutine device_masked_atomic_reduction_0(a_d, b_d, mask_d, n, n_mask, strm)
307 type(c_ptr) :: a_d, b_d, mask_d
308 integer :: n, n_mask
309 type(c_ptr), optional :: strm
310 type(c_ptr) :: strm_
311
312 if (n .lt. 1 .or. n_mask .lt. 1) return
313
314 if (present(strm)) then
315 strm_ = strm
316 else
317 strm_ = glb_cmd_queue
318 end if
319
320#if HAVE_HIP
321 call hip_masked_atomic_reduction(a_d, b_d, mask_d, n, n_mask, strm_)
322#elif HAVE_CUDA
323 call cuda_masked_atomic_reduction(a_d, b_d, mask_d, n, n_mask, strm_)
324#elif HAVE_OPENCL
325 call neko_error('No OpenCL bcknd, masked atomic reduction')
326#elif HAVE_METAL
327 call metal_masked_atomic_reduction(a_d, b_d, mask_d, n, n_mask, strm_)
328#else
329 call neko_error('no device backend configured')
330#endif
332
335 subroutine device_cfill_mask(a_d, c, n, mask_d, n_mask, strm)
336 type(c_ptr) :: a_d
337 real(kind=rp), intent(in) :: c
338 integer :: n
339 type(c_ptr) :: mask_d
340 integer :: n_mask
341 type(c_ptr), optional :: strm
342 type(c_ptr) :: strm_
343
344 if (n .lt. 1 .or. n_mask .lt. 1) return
345
346 if (present(strm)) then
347 strm_ = strm
348 else
349 strm_ = glb_cmd_queue
350 end if
351
352#if HAVE_HIP
353 call hip_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
354#elif HAVE_CUDA
355 call cuda_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
356#elif HAVE_OPENCL
357 call opencl_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
358#elif HAVE_METAL
359 call metal_cfill_mask(a_d, c, n, mask_d, n_mask, strm_)
360#else
361 call neko_error('No device backend configured')
362#endif
363 end subroutine device_cfill_mask
364
366 subroutine device_rzero(a_d, n, strm)
367 type(c_ptr) :: a_d
368 integer :: n
369 type(c_ptr), optional :: strm
370 type(c_ptr) :: strm_
371
372 if (n .lt. 1) return
373
374 if (present(strm)) then
375 strm_ = strm
376 else
377 strm_ = glb_cmd_queue
378 end if
379
380#if HAVE_HIP
381 call hip_rzero(a_d, n, strm_)
382#elif HAVE_CUDA
383 call cuda_rzero(a_d, n, strm_)
384#elif HAVE_OPENCL
385 call opencl_rzero(a_d, n, strm_)
386#elif HAVE_METAL
387 call metal_rzero(a_d, n, strm_)
388#else
389 call neko_error('No device backend configured')
390#endif
391 end subroutine device_rzero
392
394 subroutine device_rone(a_d, n, strm)
395 type(c_ptr) :: a_d
396 integer :: n
397 type(c_ptr), optional :: strm
398 type(c_ptr) :: strm_
399 real(kind=rp), parameter :: one = 1.0_rp
400
401 if (n .lt. 1) return
402
403 if (present(strm)) then
404 strm_ = strm
405 else
406 strm_ = glb_cmd_queue
407 end if
408
409#if HAVE_HIP || HAVE_CUDA || HAVE_OPENCL || HAVE_METAL
410 call device_cfill(a_d, one, n, strm_)
411#else
412 call neko_error('No device backend configured')
413#endif
414 end subroutine device_rone
415
417 subroutine device_cmult(a_d, c, n, strm)
418 type(c_ptr) :: a_d
419 real(kind=rp), intent(in) :: c
420 integer :: n
421 type(c_ptr), optional :: strm
422 type(c_ptr) :: strm_
423
424 if (n .lt. 1) return
425
426 if (present(strm)) then
427 strm_ = strm
428 else
429 strm_ = glb_cmd_queue
430 end if
431
432#if HAVE_HIP
433 call hip_cmult(a_d, c, n, strm_)
434#elif HAVE_CUDA
435 call cuda_cmult(a_d, c, n, strm_)
436#elif HAVE_OPENCL
437 call opencl_cmult(a_d, c, n, strm_)
438#elif HAVE_METAL
439 call metal_cmult(a_d, c, n, strm_)
440#else
441 call neko_error('No device backend configured')
442#endif
443 end subroutine device_cmult
444
446 subroutine device_cmult2(a_d, b_d, c, n, strm)
447 type(c_ptr) :: a_d, b_d
448 real(kind=rp), intent(in) :: c
449 integer :: n
450 type(c_ptr), optional :: strm
451 type(c_ptr) :: strm_
452
453 if (n .lt. 1) return
454
455 if (present(strm)) then
456 strm_ = strm
457 else
458 strm_ = glb_cmd_queue
459 end if
460
461#if HAVE_HIP
462 call hip_cmult2(a_d, b_d, c, n, strm_)
463#elif HAVE_CUDA
464 call cuda_cmult2(a_d, b_d, c, n, strm_)
465#elif HAVE_OPENCL
466 call opencl_cmult2(a_d, b_d, c, n, strm_)
467#elif HAVE_METAL
468 call metal_cmult2(a_d, b_d, c, n, strm_)
469#else
470 call neko_error('No device backend configured')
471#endif
472 end subroutine device_cmult2
473
475 subroutine device_cdiv(a_d, c, n, strm)
476 type(c_ptr) :: a_d
477 real(kind=rp), intent(in) :: c
478 integer :: n
479 type(c_ptr), optional :: strm
480 type(c_ptr) :: strm_
481
482 if (present(strm)) then
483 strm_ = strm
484 else
485 strm_ = glb_cmd_queue
486 end if
487
488#if HAVE_HIP
489 call hip_cdiv(a_d, c, n, strm_)
490#elif HAVE_CUDA
491 call cuda_cdiv(a_d, c, n, strm_)
492#elif HAVE_OPENCL
493 call opencl_cdiv(a_d, c, n, strm_)
494#elif HAVE_METAL
495 call metal_cdiv(a_d, c, n, strm_)
496#else
497 call neko_error('No device backend configured')
498#endif
499 end subroutine device_cdiv
500
502 subroutine device_cdiv2(a_d, b_d, c, n, strm)
503 type(c_ptr) :: a_d, b_d
504 real(kind=rp), intent(in) :: c
505 integer :: n
506 type(c_ptr), optional :: strm
507 type(c_ptr) :: strm_
508
509 if (present(strm)) then
510 strm_ = strm
511 else
512 strm_ = glb_cmd_queue
513 end if
514
515#if HAVE_HIP
516 call hip_cdiv2(a_d, b_d, c, n, strm_)
517#elif HAVE_CUDA
518 call cuda_cdiv2(a_d, b_d, c, n, strm_)
519#elif HAVE_OPENCL
520 call opencl_cdiv2(a_d, b_d, c, n, strm_)
521#elif HAVE_METAL
522 call metal_cdiv2(a_d, b_d, c, n, strm_)
523#else
524 call neko_error('No device backend configured')
525#endif
526 end subroutine device_cdiv2
527
529 subroutine device_radd(a_d, c, n, strm)
530 type(c_ptr) :: a_d
531 real(kind=rp), intent(in) :: c
532 integer :: n
533 type(c_ptr), optional :: strm
534 type(c_ptr) :: strm_
535
536 if (n .lt. 1) return
537
538 if (present(strm)) then
539 strm_ = strm
540 else
541 strm_ = glb_cmd_queue
542 end if
543
544#if HAVE_HIP
545 call hip_radd(a_d, c, n, strm_)
546#elif HAVE_CUDA
547 call cuda_radd(a_d, c, n, strm_)
548#elif HAVE_OPENCL
549 call opencl_radd(a_d, c, n, strm_)
550#elif HAVE_METAL
551 call metal_radd(a_d, c, n, strm_)
552#else
553 call neko_error('No device backend configured')
554#endif
555 end subroutine device_radd
556
558 subroutine device_cadd2(a_d, b_d, c, n, strm)
559 type(c_ptr) :: a_d
560 type(c_ptr) :: b_d
561 real(kind=rp), intent(in) :: c
562 integer :: n
563 type(c_ptr), optional :: strm
564 type(c_ptr) :: strm_
565
566 if (n .lt. 1) return
567
568 if (present(strm)) then
569 strm_ = strm
570 else
571 strm_ = glb_cmd_queue
572 end if
573
574#if HAVE_HIP
575 call hip_cadd2(a_d, b_d, c, n, strm_)
576#elif HAVE_CUDA
577 call cuda_cadd2(a_d, b_d, c, n, strm_)
578#elif HAVE_OPENCL
579 call opencl_cadd2(a_d, b_d, c, n, strm_)
580#elif HAVE_METAL
581 call metal_cadd2(a_d, b_d, c, n, strm_)
582#else
583 call neko_error('No device backend configured')
584#endif
585 end subroutine device_cadd2
586
588 subroutine device_cwrap(a_d, min_val, max_val, n, strm)
589 type(c_ptr) :: a_d
590 real(kind=rp), intent(in) :: min_val, max_val
591 integer :: n
592 type(c_ptr), optional :: strm
593 type(c_ptr) :: strm_
594
595 if (n .lt. 1 .or. max_val .le. min_val) return
596
597 if (present(strm)) then
598 strm_ = strm
599 else
600 strm_ = glb_cmd_queue
601 end if
602
603#if HAVE_HIP
604 call hip_cwrap(a_d, min_val, max_val, n, strm_)
605#elif HAVE_CUDA
606 call cuda_cwrap(a_d, min_val, max_val, n, strm_)
607#elif HAVE_OPENCL
608 call opencl_cwrap(a_d, min_val, max_val, n, strm_)
609#elif HAVE_METAL
610 call metal_cwrap(a_d, min_val, max_val, n, strm_)
611#else
612 call neko_error('No device backend configured')
613#endif
614 end subroutine device_cwrap
615
617 subroutine device_cfill(a_d, c, n, strm)
618 type(c_ptr) :: a_d
619 real(kind=rp), intent(in) :: c
620 integer :: n
621 type(c_ptr), optional ::strm
622 type(c_ptr) :: strm_
623
624 if (n .lt. 1) return
625
626 if (present(strm)) then
627 strm_ = strm
628 else
629 strm_ = glb_cmd_queue
630 end if
631
632#if HAVE_HIP
633 call hip_cfill(a_d, c, n, strm_)
634#elif HAVE_CUDA
635 call cuda_cfill(a_d, c, n, strm_)
636#elif HAVE_OPENCL
637 call opencl_cfill(a_d, c, n, strm_)
638#elif HAVE_METAL
639 call metal_cfill(a_d, c, n, strm_)
640#else
641 call neko_error('No device backend configured')
642#endif
643 end subroutine device_cfill
644
646 subroutine device_add2(a_d, b_d, n, strm)
647 type(c_ptr) :: a_d, b_d
648 integer :: n
649 type(c_ptr), optional :: strm
650 type(c_ptr) :: strm_
651
652 if (n .lt. 1) return
653
654 if (present(strm)) then
655 strm_ = strm
656 else
657 strm_ = glb_cmd_queue
658 end if
659
660#if HAVE_HIP
661 call hip_add2(a_d, b_d, n, strm_)
662#elif HAVE_CUDA
663 call cuda_add2(a_d, b_d, n, strm_)
664#elif HAVE_OPENCL
665 call opencl_add2(a_d, b_d, n, strm_)
666#elif HAVE_METAL
667 call metal_add2(a_d, b_d, n, strm_)
668#else
669 call neko_error('No device backend configured')
670#endif
671 end subroutine device_add2
672
673 subroutine device_add4(a_d, b_d, c_d, d_d, n, strm)
674 type(c_ptr) :: a_d, b_d, c_d, d_d
675 integer :: n
676 type(c_ptr), optional :: strm
677 type(c_ptr) :: strm_
678
679 if (n .lt. 1) return
680
681 if (present(strm)) then
682 strm_ = strm
683 else
684 strm_ = glb_cmd_queue
685 end if
686
687#if HAVE_HIP
688 call hip_add4(a_d, b_d, c_d, d_d, n, strm_)
689#elif HAVE_CUDA
690 call cuda_add4(a_d, b_d, c_d, d_d, n, strm_)
691#elif HAVE_OPENCL
692 call opencl_add4(a_d, b_d, c_d, d_d, n, strm_)
693#elif HAVE_METAL
694 call metal_add4(a_d, b_d, c_d, d_d, n, strm_)
695#else
696 call neko_error('No device backend configured')
697#endif
698 end subroutine device_add4
699
700 subroutine device_add2s1(a_d, b_d, c1, n, strm)
701 type(c_ptr) :: a_d, b_d
702 real(kind=rp) :: c1
703 integer :: n
704 type(c_ptr), optional :: strm
705 type(c_ptr) :: strm_
706
707 if (n .lt. 1) return
708
709 if (present(strm)) then
710 strm_ = strm
711 else
712 strm_ = glb_cmd_queue
713 end if
714
715#if HAVE_HIP
716 call hip_add2s1(a_d, b_d, c1, n, strm_)
717#elif HAVE_CUDA
718 call cuda_add2s1(a_d, b_d, c1, n, strm_)
719#elif HAVE_OPENCL
720 call opencl_add2s1(a_d, b_d, c1, n, strm_)
721#elif HAVE_METAL
722 call metal_add2s1(a_d, b_d, c1, n, strm_)
723#else
724 call neko_error('No device backend configured')
725#endif
726 end subroutine device_add2s1
727
730 subroutine device_add2s2(a_d, b_d, c1, n, strm)
731 type(c_ptr) :: a_d, b_d
732 real(kind=rp) :: c1
733 integer :: n
734 type(c_ptr), optional :: strm
735 type(c_ptr) :: strm_
736
737 if (n .lt. 1) return
738
739 if (present(strm)) then
740 strm_ = strm
741 else
742 strm_ = glb_cmd_queue
743 end if
744
745#if HAVE_HIP
746 call hip_add2s2(a_d, b_d, c1, n, strm_)
747#elif HAVE_CUDA
748 call cuda_add2s2(a_d, b_d, c1, n, strm_)
749#elif HAVE_OPENCL
750 call opencl_add2s2(a_d, b_d, c1, n, strm_)
751#elif HAVE_METAL
752 call metal_add2s2(a_d, b_d, c1, n, strm_)
753#else
754 call neko_error('No device backend configured')
755#endif
756 end subroutine device_add2s2
757
759 subroutine device_addsqr2s2(a_d, b_d, c1, n, strm)
760 type(c_ptr) :: a_d, b_d
761 real(kind=rp) :: c1
762 integer :: n
763 type(c_ptr), optional :: strm
764 type(c_ptr) :: strm_
765
766 if (n .lt. 1) return
767
768 if (present(strm)) then
769 strm_ = strm
770 else
771 strm_ = glb_cmd_queue
772 end if
773
774#if HAVE_HIP
775 call hip_addsqr2s2(a_d, b_d, c1, n, strm_)
776#elif HAVE_CUDA
777 call cuda_addsqr2s2(a_d, b_d, c1, n, strm_)
778#elif HAVE_OPENCL
779 call opencl_addsqr2s2(a_d, b_d, c1, n, strm_)
780#elif HAVE_METAL
781 call metal_addsqr2s2(a_d, b_d, c1, n, strm_)
782#else
783 call neko_error('No device backend configured')
784#endif
785 end subroutine device_addsqr2s2
786
788 subroutine device_add3(a_d, b_d, c_d, n, strm)
789 type(c_ptr) :: a_d, b_d, c_d
790 integer :: n
791 type(c_ptr), optional :: strm
792 type(c_ptr) :: strm_
793
794 if (n .lt. 1) return
795
796 if (present(strm)) then
797 strm_ = strm
798 else
799 strm_ = glb_cmd_queue
800 end if
801
802#if HAVE_HIP
803 call hip_add3(a_d, b_d, c_d, n, strm_)
804#elif HAVE_CUDA
805 call cuda_add3(a_d, b_d, c_d, n, strm_)
806#elif HAVE_OPENCL
807 call opencl_add3(a_d, b_d, c_d, n, strm_)
808#elif HAVE_METAL
809 call metal_add3(a_d, b_d, c_d, n, strm_)
810#else
811 call neko_error('No device backend configured')
812#endif
813 end subroutine device_add3
814
816 subroutine device_add3s2(a_d, b_d, c_d, c1, c2 , n, strm)
817 type(c_ptr) :: a_d, b_d, c_d
818 real(kind=rp) :: c1, c2
819 integer :: n
820 type(c_ptr), optional :: strm
821 type(c_ptr) :: strm_
822
823 if (n .lt. 1) return
824
825 if (present(strm)) then
826 strm_ = strm
827 else
828 strm_ = glb_cmd_queue
829 end if
830
831#if HAVE_HIP
832 call hip_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
833#elif HAVE_CUDA
834 call cuda_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
835#elif HAVE_OPENCL
836 call opencl_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
837#elif HAVE_METAL
838 call metal_add3s2(a_d, b_d, c_d, c1, c2, n, strm_)
839#else
840 call neko_error('No device backend configured')
841#endif
842 end subroutine device_add3s2
843
845 subroutine device_add4s3(a_d, b_d, c_d, d_d, c1, c2 , c3, n, strm)
846 type(c_ptr) :: a_d, b_d, c_d, d_d
847 real(kind=rp) :: c1, c2, c3
848 integer :: n
849 type(c_ptr), optional :: strm
850 type(c_ptr) :: strm_
851
852 if (n .lt. 1) return
853
854 if (present(strm)) then
855 strm_ = strm
856 else
857 strm_ = glb_cmd_queue
858 end if
859
860#if HAVE_HIP
861 call hip_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
862#elif HAVE_CUDA
863 call cuda_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
864#elif HAVE_OPENCL
865 call opencl_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
866#elif HAVE_METAL
867 call metal_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm_)
868#else
869 call neko_error('No device backend configured')
870#endif
871 end subroutine device_add4s3
872
874 subroutine device_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2 , c3, c4, n, strm)
875 type(c_ptr) :: a_d, b_d, c_d, d_d, e_d
876 real(kind=rp) :: c1, c2, c3, c4
877 integer :: n
878 type(c_ptr), optional :: strm
879 type(c_ptr) :: strm_
880
881 if (n .lt. 1) return
882
883 if (present(strm)) then
884 strm_ = strm
885 else
886 strm_ = glb_cmd_queue
887 end if
888
889#if HAVE_HIP
890 call hip_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
891#elif HAVE_CUDA
892 call cuda_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
893#elif HAVE_OPENCL
894 call opencl_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
895#elif HAVE_METAL
896 call metal_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm_)
897#else
898 call neko_error('No device backend configured')
899#endif
900 end subroutine device_add5s4
901
903 subroutine device_invcol1(a_d, n, strm)
904 type(c_ptr) :: a_d
905 integer :: n
906 type(c_ptr), optional :: strm
907 type(c_ptr) :: strm_
908
909 if (n .lt. 1) return
910
911 if (present(strm)) then
912 strm_ = strm
913 else
914 strm_ = glb_cmd_queue
915 end if
916
917#if HAVE_HIP
918 call hip_invcol1(a_d, n, strm_)
919#elif HAVE_CUDA
920 call cuda_invcol1(a_d, n, strm_)
921#elif HAVE_OPENCL
922 call opencl_invcol1(a_d, n, strm_)
923#elif HAVE_METAL
924 call metal_invcol1(a_d, n, strm_)
925#else
926 call neko_error('No device backend configured')
927#endif
928 end subroutine device_invcol1
929
931 subroutine device_invcol2(a_d, b_d, n, strm)
932 type(c_ptr) :: a_d, b_d
933 integer :: n
934 type(c_ptr), optional :: strm
935 type(c_ptr) :: strm_
936
937 if (n .lt. 1) return
938
939 if (present(strm)) then
940 strm_ = strm
941 else
942 strm_ = glb_cmd_queue
943 end if
944
945#if HAVE_HIP
946 call hip_invcol2(a_d, b_d, n, strm_)
947#elif HAVE_CUDA
948 call cuda_invcol2(a_d, b_d, n, strm_)
949#elif HAVE_OPENCL
950 call opencl_invcol2(a_d, b_d, n, strm_)
951#elif HAVE_METAL
952 call metal_invcol2(a_d, b_d, n, strm_)
953#else
954 call neko_error('No device backend configured')
955#endif
956 end subroutine device_invcol2
957
959 subroutine device_invcol3(a_d, b_d, c_d, n, strm)
960 type(c_ptr) :: a_d, b_d, c_d
961 integer :: n
962 type(c_ptr), optional :: strm
963 type(c_ptr) :: strm_
964
965 if (present(strm)) then
966 strm_ = strm
967 else
968 strm_ = glb_cmd_queue
969 end if
970
971#ifdef HAVE_HIP
972 call hip_invcol3(a_d, b_d, c_d, n, strm_)
973#elif HAVE_CUDA
974 call cuda_invcol3(a_d, b_d, c_d, n, strm_)
975#elif HAVE_OPENCL
976 ! call opencl_invcol3(a_d, b_d, c_d, n)
977 call neko_error('opencl_invcol3 not implemented')
978#elif HAVE_METAL
979 call metal_invcol3(a_d, b_d, c_d, n, strm_)
980#else
981 call neko_error('No device backend configured')
982#endif
983 end subroutine device_invcol3
984
986 subroutine device_col2(a_d, b_d, n, strm)
987 type(c_ptr) :: a_d, b_d
988 integer :: n
989 type(c_ptr), optional :: strm
990 type(c_ptr) :: strm_
991
992 if (present(strm)) then
993 strm_ = strm
994 else
995 strm_ = glb_cmd_queue
996 end if
997
998 if (n .lt. 1) return
999#if HAVE_HIP
1000 call hip_col2(a_d, b_d, n, strm_)
1001#elif HAVE_CUDA
1002 call cuda_col2(a_d, b_d, n, strm_)
1003#elif HAVE_OPENCL
1004 call opencl_col2(a_d, b_d, n, strm_)
1005#elif HAVE_METAL
1006 call metal_col2(a_d, b_d, n, strm_)
1007#else
1008 call neko_error('No device backend configured')
1009#endif
1010 end subroutine device_col2
1011
1013 subroutine device_col3(a_d, b_d, c_d, n, strm)
1014 type(c_ptr) :: a_d, b_d, c_d
1015 integer :: n
1016 type(c_ptr), optional :: strm
1017 type(c_ptr) :: strm_
1018
1019 if (n .lt. 1) return
1020
1021 if (present(strm)) then
1022 strm_ = strm
1023 else
1024 strm_ = glb_cmd_queue
1025 end if
1026
1027#if HAVE_HIP
1028 call hip_col3(a_d, b_d, c_d, n, strm_)
1029#elif HAVE_CUDA
1030 call cuda_col3(a_d, b_d, c_d, n, strm_)
1031#elif HAVE_OPENCL
1032 call opencl_col3(a_d, b_d, c_d, n, strm_)
1033#elif HAVE_METAL
1034 call metal_col3(a_d, b_d, c_d, n, strm_)
1035#else
1036 call neko_error('No device backend configured')
1037#endif
1038 end subroutine device_col3
1039
1041 subroutine device_subcol3(a_d, b_d, c_d, n, strm)
1042 type(c_ptr) :: a_d, b_d, c_d
1043 integer :: n
1044 type(c_ptr), optional :: strm
1045 type(c_ptr) :: strm_
1046
1047 if (n .lt. 1) return
1048
1049 if (present(strm)) then
1050 strm_ = strm
1051 else
1052 strm_ = glb_cmd_queue
1053 end if
1054
1055#if HAVE_HIP
1056 call hip_subcol3(a_d, b_d, c_d, n, strm_)
1057#elif HAVE_CUDA
1058 call cuda_subcol3(a_d, b_d, c_d, n, strm_)
1059#elif HAVE_OPENCL
1060 call opencl_subcol3(a_d, b_d, c_d, n, strm_)
1061#elif HAVE_METAL
1062 call metal_subcol3(a_d, b_d, c_d, n, strm_)
1063#else
1064 call neko_error('No device backend configured')
1065#endif
1066 end subroutine device_subcol3
1067
1069 subroutine device_sub2(a_d, b_d, n, strm)
1070 type(c_ptr) :: a_d, b_d
1071 integer :: n
1072 type(c_ptr), optional :: strm
1073 type(c_ptr) :: strm_
1074
1075 if (n .lt. 1) return
1076
1077 if (present(strm)) then
1078 strm_ = strm
1079 else
1080 strm_ = glb_cmd_queue
1081 end if
1082
1083#if HAVE_HIP
1084 call hip_sub2(a_d, b_d, n, strm_)
1085#elif HAVE_CUDA
1086 call cuda_sub2(a_d, b_d, n, strm_)
1087#elif HAVE_OPENCL
1088 call opencl_sub2(a_d, b_d, n, strm_)
1089#elif HAVE_METAL
1090 call metal_sub2(a_d, b_d, n, strm_)
1091#else
1092 call neko_error('No device backend configured')
1093#endif
1094 end subroutine device_sub2
1095
1097 subroutine device_sub3(a_d, b_d, c_d, n, strm)
1098 type(c_ptr) :: a_d, b_d, c_d
1099 integer :: n
1100 type(c_ptr), optional :: strm
1101 type(c_ptr) :: strm_
1102
1103 if (n .lt. 1) return
1104
1105 if (present(strm)) then
1106 strm_ = strm
1107 else
1108 strm_ = glb_cmd_queue
1109 end if
1110
1111#if HAVE_HIP
1112 call hip_sub3(a_d, b_d, c_d, n, strm_)
1113#elif HAVE_CUDA
1114 call cuda_sub3(a_d, b_d, c_d, n, strm_)
1115#elif HAVE_OPENCL
1116 call opencl_sub3(a_d, b_d, c_d, n, strm_)
1117#elif HAVE_METAL
1118 call metal_sub3(a_d, b_d, c_d, n, strm_)
1119#else
1120 call neko_error('No device backend configured')
1121#endif
1122 end subroutine device_sub3
1123
1125 subroutine device_addcol3(a_d, b_d, c_d, n, strm)
1126 type(c_ptr) :: a_d, b_d, c_d
1127 integer :: n
1128 type(c_ptr), optional :: strm
1129 type(c_ptr) :: strm_
1130
1131 if (n .lt. 1) return
1132
1133 if (present(strm)) then
1134 strm_ = strm
1135 else
1136 strm_ = glb_cmd_queue
1137 end if
1138
1139#if HAVE_HIP
1140 call hip_addcol3(a_d, b_d, c_d, n, strm_)
1141#elif HAVE_CUDA
1142 call cuda_addcol3(a_d, b_d, c_d, n, strm_)
1143#elif HAVE_OPENCL
1144 call opencl_addcol3(a_d, b_d, c_d, n, strm_)
1145#elif HAVE_METAL
1146 call metal_addcol3(a_d, b_d, c_d, n, strm_)
1147#else
1148 call neko_error('No device backend configured')
1149#endif
1150 end subroutine device_addcol3
1151
1153 subroutine device_addcol4(a_d, b_d, c_d, d_d, n, strm)
1154 type(c_ptr) :: a_d, b_d, c_d, d_d
1155 integer :: n
1156 type(c_ptr), optional :: strm
1157 type(c_ptr) :: strm_
1158
1159 if (n .lt. 1) return
1160
1161 if (present(strm)) then
1162 strm_ = strm
1163 else
1164 strm_ = glb_cmd_queue
1165 end if
1166
1167#if HAVE_HIP
1168 call hip_addcol4(a_d, b_d, c_d, d_d, n, strm_)
1169#elif HAVE_CUDA
1170 call cuda_addcol4(a_d, b_d, c_d, d_d, n, strm_)
1171#elif HAVE_OPENCL
1172 call opencl_addcol4(a_d, b_d, c_d, d_d, n, strm_)
1173#elif HAVE_METAL
1174 call metal_addcol4(a_d, b_d, c_d, d_d, n, strm_)
1175#else
1176 call neko_error('No device backend configured')
1177#endif
1178 end subroutine device_addcol4
1179
1181 subroutine device_addcol3s2(a_d, b_d, c_d, s, n, strm)
1182 type(c_ptr) :: a_d, b_d, c_d
1183 real(kind=rp) :: s
1184 integer :: n
1185 type(c_ptr), optional :: strm
1186 type(c_ptr) :: strm_
1187
1188 if (n .lt. 1) return
1189
1190 if (present(strm)) then
1191 strm_ = strm
1192 else
1193 strm_ = glb_cmd_queue
1194 end if
1195
1196#if HAVE_HIP
1197 call hip_addcol3s2(a_d, b_d, c_d, s, n, strm_)
1198#elif HAVE_CUDA
1199 call cuda_addcol3s2(a_d, b_d, c_d, s, n, strm_)
1200#elif HAVE_OPENCL
1201 call opencl_addcol3s2(a_d, b_d, c_d, s, n, strm_)
1202#elif HAVE_METAL
1203 call metal_addcol3s2(a_d, b_d, c_d, s, n, strm_)
1204#else
1205 call neko_error('No device backend configured')
1206#endif
1207 end subroutine device_addcol3s2
1208
1211 subroutine device_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm)
1212 type(c_ptr) :: dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d
1213 integer :: n
1214 type(c_ptr), optional :: strm
1215 type(c_ptr) :: strm_
1216
1217 if (n .lt. 1) return
1218
1219 if (present(strm)) then
1220 strm_ = strm
1221 else
1222 strm_ = glb_cmd_queue
1223 end if
1224
1225#if HAVE_HIP
1226 call hip_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1227#elif HAVE_CUDA
1228 call cuda_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1229#elif HAVE_OPENCL
1230 call opencl_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1231#elif HAVE_METAL
1232 call metal_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm_)
1233#else
1234 call neko_error('No device backend configured')
1235#endif
1236 end subroutine device_vdot3
1237
1240 subroutine device_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1241 w1_d, w2_d, w3_d, n, strm)
1242 type(c_ptr) :: u1_d, u2_d, u3_d
1243 type(c_ptr) :: v1_d, v2_d, v3_d
1244 type(c_ptr) :: w1_d, w2_d, w3_d
1245 integer :: n
1246 type(c_ptr), optional :: strm
1247 type(c_ptr) :: strm_
1248
1249 if (n .lt. 1) return
1250
1251 if (present(strm)) then
1252 strm_ = strm
1253 else
1254 strm_ = glb_cmd_queue
1255 end if
1256
1257#if HAVE_HIP
1258 call hip_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1259 w1_d, w2_d, w3_d, n, strm_)
1260#elif HAVE_CUDA
1261 call cuda_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1262 w1_d, w2_d, w3_d, n, strm_)
1263#elif HAVE_OPENCL
1264 call opencl_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1265 w1_d, w2_d, w3_d, n, strm_)
1266#elif HAVE_METAL
1267 call metal_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, &
1268 w1_d, w2_d, w3_d, n, strm_)
1269#else
1270 call neko_error('No device backend configured')
1271#endif
1272 end subroutine device_vcross
1273
1274
1276 function device_vlsc3(u_d, v_d, w_d, n, strm) result(res)
1277 type(c_ptr) :: u_d, v_d, w_d
1278 integer :: n
1279 type(c_ptr), optional :: strm
1280 type(c_ptr) :: strm_
1281 real(kind=rp) :: res
1282
1283 res = 0.0_rp
1284
1285 if (n .lt. 1) return
1286
1287 if (present(strm)) then
1288 strm_ = strm
1289 else
1290 strm_ = glb_cmd_queue
1291 end if
1292
1293#if HAVE_HIP
1294 res = hip_vlsc3(u_d, v_d, w_d, n, strm_)
1295#elif HAVE_CUDA
1296 res = cuda_vlsc3(u_d, v_d, w_d, n, strm_)
1297#elif HAVE_OPENCL
1298 ! Same kernel as glsc3 (currently no device MPI for OpenCL)
1299 res = opencl_glsc3(u_d, v_d, w_d, n, strm_)
1300#elif HAVE_METAL
1301 ! Same kernel as glsc3 (currently no device MPI for OpenCL)
1302 res = metal_glsc3(u_d, v_d, w_d, n, strm_)
1303#else
1304 call neko_error('No device backend configured')
1305#endif
1306 end function device_vlsc3
1307
1309 function device_glsc3(a_d, b_d, c_d, n, strm) result(res)
1310 type(c_ptr) :: a_d, b_d, c_d
1311 integer :: n, ierr
1312 type(c_ptr), optional :: strm
1313 type(c_ptr) :: strm_
1314 real(kind=rp) :: res
1315
1316 if (present(strm)) then
1317 strm_ = strm
1318 else
1319 strm_ = glb_cmd_queue
1320 end if
1321
1322 res = 0.0_rp
1323#if HAVE_HIP
1324 res = hip_glsc3(a_d, b_d, c_d, n, strm_)
1325#elif HAVE_CUDA
1326 res = cuda_glsc3(a_d, b_d, c_d, n, strm_)
1327#elif HAVE_OPENCL
1328 res = opencl_glsc3(a_d, b_d, c_d, n, strm_)
1329#elif HAVE_METAL
1330 res = metal_glsc3(a_d, b_d, c_d, n, strm_)
1331#else
1332 call neko_error('No device backend configured')
1333#endif
1334
1335#ifndef HAVE_DEVICE_MPI
1336 if (pe_size .gt. 1) then
1337 call mpi_allreduce(mpi_in_place, res, 1, &
1338 mpi_real_precision, mpi_sum, neko_comm, ierr)
1339 end if
1340#endif
1341 end function device_glsc3
1342
1343 subroutine device_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm)
1344 type(c_ptr), value :: w_d, v_d_d, mult_d
1345 integer(c_int) :: j, n
1346 real(c_rp) :: h(j)
1347 type(c_ptr), optional :: strm
1348 type(c_ptr) :: strm_
1349 integer :: ierr
1350
1351 if (present(strm)) then
1352 strm_ = strm
1353 else
1354 strm_ = glb_cmd_queue
1355 end if
1356
1357#if HAVE_HIP
1358 call hip_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1359#elif HAVE_CUDA
1360 call cuda_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1361#elif HAVE_OPENCL
1362 call opencl_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1363#elif HAVE_METAL
1364 call metal_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm_)
1365#else
1366 call neko_error('No device backend configured')
1367#endif
1368
1369#ifndef HAVE_DEVICE_MPI
1370 if (pe_size .gt. 1) then
1371 call mpi_allreduce(mpi_in_place, h, j, &
1372 mpi_real_precision, mpi_sum, neko_comm, ierr)
1373 end if
1374#endif
1375 end subroutine device_glsc3_many
1376
1377 subroutine device_add2s2_many(y_d, x_d_d, a_d, j, n, strm)
1378 type(c_ptr), value :: y_d, x_d_d, a_d
1379 integer(c_int) :: j, n
1380 type(c_ptr), optional :: strm
1381 type(c_ptr) :: strm_
1382
1383 if (n .lt. 1) return
1384
1385 if (present(strm)) then
1386 strm_ = strm
1387 else
1388 strm_ = glb_cmd_queue
1389 end if
1390
1391#if HAVE_HIP
1392 call hip_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1393#elif HAVE_CUDA
1394 call cuda_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1395#elif HAVE_OPENCL
1396 call opencl_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1397#elif HAVE_METAL
1398 call metal_add2s2_many(y_d, x_d_d, a_d, j, n, strm_)
1399#else
1400 call neko_error('No device backend configured')
1401#endif
1402 end subroutine device_add2s2_many
1403
1405 function device_glsc2(a_d, b_d, n, strm) result(res)
1406 type(c_ptr) :: a_d, b_d
1407 integer :: n, ierr
1408 real(kind=rp) :: res
1409 type(c_ptr), optional :: strm
1410 type(c_ptr) :: strm_
1411
1412 if (present(strm)) then
1413 strm_ = strm
1414 else
1415 strm_ = glb_cmd_queue
1416 end if
1417
1418 res = 0.0_rp
1419#if HAVE_HIP
1420 res = hip_glsc2(a_d, b_d, n, strm_)
1421#elif HAVE_CUDA
1422 res = cuda_glsc2(a_d, b_d, n, strm_)
1423#elif HAVE_OPENCL
1424 res = opencl_glsc2(a_d, b_d, n, strm_)
1425#elif HAVE_METAL
1426 res = metal_glsc2(a_d, b_d, n, strm_)
1427#else
1428 call neko_error('No device backend configured')
1429#endif
1430
1431#ifndef HAVE_DEVICE_MPI
1432 if (pe_size .gt. 1) then
1433 call mpi_allreduce(mpi_in_place, res, 1, &
1434 mpi_real_precision, mpi_sum, neko_comm, ierr)
1435 end if
1436#endif
1437 end function device_glsc2
1438
1441 function device_glsubnorm(a_d, b_d, n, strm) result(res)
1442 type(c_ptr), intent(in) :: a_d, b_d
1443 integer, intent(in) :: n
1444 integer :: ierr
1445 real(kind=rp) :: res
1446 type(c_ptr), optional :: strm
1447 type(c_ptr) :: strm_
1448
1449 if (present(strm)) then
1450 strm_ = strm
1451 else
1452 strm_ = glb_cmd_queue
1453 end if
1454
1455 res = 0.0_rp
1456#if HAVE_HIP
1457 res = hip_glsubnorm2(a_d, b_d, n, strm_)
1458#elif HAVE_CUDA
1459 res = cuda_glsubnorm2(a_d, b_d, n, strm_)
1460#elif HAVE_OPENCL
1461 res = opencl_glsubnorm2(a_d, b_d, n, strm_)
1462#elif HAVE_METAL
1463 res = metal_glsubnorm2(a_d, b_d, n, strm_)
1464#else
1465 call neko_error('No device backend configured')
1466#endif
1467
1468#ifndef HAVE_DEVICE_MPI
1469 if (pe_size .gt. 1) then
1470 call mpi_allreduce(mpi_in_place, res, 1, &
1471 mpi_real_precision, mpi_sum, neko_comm, ierr)
1472 end if
1473#endif
1474
1475 res = sqrt(res)
1476 end function device_glsubnorm
1477
1479 function device_glsum(a_d, n, strm) result(res)
1480 type(c_ptr) :: a_d
1481 integer :: n, ierr
1482 real(kind=rp) :: res
1483 type(c_ptr), optional :: strm
1484 type(c_ptr) :: strm_
1485
1486 if (present(strm)) then
1487 strm_ = strm
1488 else
1489 strm_ = glb_cmd_queue
1490 end if
1491
1492 res = 0.0_rp
1493#if HAVE_HIP
1494 res = hip_glsum(a_d, n, strm_)
1495#elif HAVE_CUDA
1496 res = cuda_glsum(a_d, n, strm_)
1497#elif HAVE_OPENCL
1498 res = opencl_glsum(a_d, n, strm_)
1499#elif HAVE_METAL
1500 res = metal_glsum(a_d, n, strm_)
1501#else
1502 call neko_error('No device backend configured')
1503#endif
1504
1505#ifndef HAVE_DEVICE_MPI
1506 if (pe_size .gt. 1) then
1507 call mpi_allreduce(mpi_in_place, res, 1, &
1508 mpi_real_precision, mpi_sum, neko_comm, ierr)
1509 end if
1510#endif
1511 end function device_glsum
1512
1514 function device_glmax(a_d, n, strm) result(res)
1515 type(c_ptr) :: a_d
1516 integer :: n, ierr
1517 real(kind=rp) :: res, ninf
1518 type(c_ptr), optional :: strm
1519 type(c_ptr) :: strm_
1520
1521 if (n .lt. 1) then
1522 res = -huge(0.0_rp)
1523 return
1524 end if
1525
1526 if (present(strm)) then
1527 strm_ = strm
1528 else
1529 strm_ = glb_cmd_queue
1530 end if
1531
1532 ninf = -huge(0.0_rp)
1533#if HAVE_HIP
1534 res = hip_glmax(a_d, ninf, n, strm_)
1535#elif HAVE_CUDA
1536 res = cuda_glmax(a_d, ninf, n, strm_)
1537#elif HAVE_OPENCL
1538 res = opencl_glmax(a_d, n, strm_)
1539#elif HAVE_METAL
1540 res = metal_glmax(a_d, ninf, n, strm_)
1541#else
1542 call neko_error('No device backend configured')
1543#endif
1544
1545#ifndef HAVE_DEVICE_MPI
1546 if (pe_size .gt. 1) then
1547 call mpi_allreduce(mpi_in_place, res, 1, &
1548 mpi_real_precision, mpi_max, neko_comm, ierr)
1549 end if
1550#endif
1551 end function device_glmax
1552
1554 function device_glmin(a_d, n, strm) result(res)
1555 type(c_ptr) :: a_d
1556 integer :: n, ierr
1557 real(kind=rp) :: res, pinf
1558 type(c_ptr), optional :: strm
1559 type(c_ptr) :: strm_
1560
1561 if (n .lt. 1) then
1562 res = huge(0.0_rp)
1563 return
1564 end if
1565
1566 if (present(strm)) then
1567 strm_ = strm
1568 else
1569 strm_ = glb_cmd_queue
1570 end if
1571
1572 pinf = huge(0.0_rp)
1573#if HAVE_HIP
1574 res = hip_glmin(a_d, pinf, n, strm_)
1575#elif HAVE_CUDA
1576 res = cuda_glmin(a_d, pinf, n, strm_)
1577#elif HAVE_OPENCL
1578 res = opencl_glmin(a_d, n, strm_)
1579#elif HAVE_METAL
1580 res = metal_glmin(a_d, pinf, n, strm_)
1581#else
1582 call neko_error('No device backend configured')
1583#endif
1584
1585#ifndef HAVE_DEVICE_MPI
1586 if (pe_size .gt. 1) then
1587 call mpi_allreduce(mpi_in_place, res, 1, &
1588 mpi_real_precision, mpi_min, neko_comm, ierr)
1589 end if
1590#endif
1591 end function device_glmin
1592
1593 subroutine device_absval(a_d, n, strm)
1594 integer, intent(in) :: n
1595 type(c_ptr) :: a_d
1596 type(c_ptr), optional :: strm
1597 type(c_ptr) :: strm_
1598
1599 if (n .lt. 1) return
1600
1601 if (present(strm)) then
1602 strm_ = strm
1603 else
1604 strm_ = glb_cmd_queue
1605 end if
1606
1607#ifdef HAVE_HIP
1608 call hip_absval(a_d, n, strm_)
1609#elif HAVE_CUDA
1610 call cuda_absval(a_d, n, strm_)
1611#elif HAVE_OPENCL
1612 call opencl_absval(a_d, n, strm_)
1613#elif HAVE_METAL
1614 call metal_absval(a_d, n, strm_)
1615#else
1616 call neko_error('No device backend configured')
1617#endif
1618
1619 end subroutine device_absval
1620
1621 ! ========================================================================== !
1622 ! Device point-wise max
1623
1626 subroutine device_pwmax2(a_d, b_d, n, strm)
1627 type(c_ptr) :: a_d, b_d
1628 integer :: n
1629 type(c_ptr), optional :: strm
1630 type(c_ptr) :: strm_
1631
1632 if (n .lt. 1) return
1633
1634 if (present(strm)) then
1635 strm_ = strm
1636 else
1637 strm_ = glb_cmd_queue
1638 end if
1639
1640#if HAVE_HIP
1641 call hip_pwmax_vec2(a_d, b_d, n, strm_)
1642#elif HAVE_CUDA
1643 call cuda_pwmax_vec2(a_d, b_d, n, strm_)
1644#elif HAVE_OPENCL
1645 call opencl_pwmax_vec2(a_d, b_d, n, strm_)
1646#elif HAVE_METAL
1647 call metal_pwmax_vec2(a_d, b_d, n, strm_)
1648#else
1649 call neko_error('No device backend configured')
1650#endif
1651 end subroutine device_pwmax2
1652
1655 subroutine device_pwmax3(a_d, b_d, c_d, n, strm)
1656 type(c_ptr) :: a_d, b_d, c_d
1657 integer :: n
1658 type(c_ptr), optional :: strm
1659 type(c_ptr) :: strm_
1660
1661 if (n .lt. 1) return
1662
1663 if (present(strm)) then
1664 strm_ = strm
1665 else
1666 strm_ = glb_cmd_queue
1667 end if
1668
1669#if HAVE_HIP
1670 call hip_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1671#elif HAVE_CUDA
1672 call cuda_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1673#elif HAVE_OPENCL
1674 call opencl_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1675#elif HAVE_METAL
1676 call metal_pwmax_vec3(a_d, b_d, c_d, n, strm_)
1677#else
1678 call neko_error('No device backend configured')
1679#endif
1680
1681 end subroutine device_pwmax3
1682
1685 subroutine device_cpwmax2(a_d, c, n, strm)
1686 type(c_ptr) :: a_d
1687 real(kind=rp), intent(in) :: c
1688 integer :: n
1689 type(c_ptr), optional :: strm
1690 type(c_ptr) :: strm_
1691
1692 if (n .lt. 1) return
1693
1694 if (present(strm)) then
1695 strm_ = strm
1696 else
1697 strm_ = glb_cmd_queue
1698 end if
1699
1700#if HAVE_HIP
1701 call hip_pwmax_sca2(a_d, c, n, strm_)
1702#elif HAVE_CUDA
1703 call cuda_pwmax_sca2(a_d, c, n, strm_)
1704#elif HAVE_OPENCL
1705 call opencl_pwmax_sca2(a_d, c, n, strm_)
1706#elif HAVE_METAL
1707 call metal_pwmax_sca2(a_d, c, n, strm_)
1708#else
1709 call neko_error('No device backend configured')
1710#endif
1711
1712 end subroutine device_cpwmax2
1713
1716 subroutine device_cpwmax3(a_d, b_d, c, n, strm)
1717 type(c_ptr) :: a_d, b_d
1718 real(kind=rp), intent(in) :: c
1719 integer :: n
1720 type(c_ptr), optional :: strm
1721 type(c_ptr) :: strm_
1722
1723 if (n .lt. 1) return
1724
1725 if (present(strm)) then
1726 strm_ = strm
1727 else
1728 strm_ = glb_cmd_queue
1729 end if
1730
1731#if HAVE_HIP
1732 call hip_pwmax_sca3(a_d, b_d, c, n, strm_)
1733#elif HAVE_CUDA
1734 call cuda_pwmax_sca3(a_d, b_d, c, n, strm_)
1735#elif HAVE_OPENCL
1736 call opencl_pwmax_sca3(a_d, b_d, c, n, strm_)
1737#elif HAVE_METAL
1738 call metal_pwmax_sca3(a_d, b_d, c, n, strm_)
1739#else
1740 call neko_error('No device backend configured')
1741#endif
1742
1743 end subroutine device_cpwmax3
1744
1745 ! ========================================================================== !
1746 ! Device point-wise min
1747
1750 subroutine device_pwmin2(a_d, b_d, n, strm)
1751 type(c_ptr) :: a_d, b_d
1752 integer :: n
1753 type(c_ptr), optional :: strm
1754 type(c_ptr) :: strm_
1755
1756 if (n .lt. 1) return
1757
1758 if (present(strm)) then
1759 strm_ = strm
1760 else
1761 strm_ = glb_cmd_queue
1762 end if
1763
1764#if HAVE_HIP
1765 call hip_pwmin_vec2(a_d, b_d, n, strm_)
1766#elif HAVE_CUDA
1767 call cuda_pwmin_vec2(a_d, b_d, n, strm_)
1768#elif HAVE_OPENCL
1769 call opencl_pwmin_vec2(a_d, b_d, n, strm_)
1770#elif HAVE_METAL
1771 call metal_pwmin_vec2(a_d, b_d, n, strm_)
1772#else
1773 call neko_error('No device backend configured')
1774#endif
1775 end subroutine device_pwmin2
1776
1779 subroutine device_pwmin3(a_d, b_d, c_d, n, strm)
1780 type(c_ptr) :: a_d, b_d, c_d
1781 integer :: n
1782 type(c_ptr), optional :: strm
1783 type(c_ptr) :: strm_
1784
1785 if (n .lt. 1) return
1786
1787 if (present(strm)) then
1788 strm_ = strm
1789 else
1790 strm_ = glb_cmd_queue
1791 end if
1792
1793#if HAVE_HIP
1794 call hip_pwmin_vec3(a_d, b_d, c_d, n, strm_)
1795#elif HAVE_CUDA
1796 call cuda_pwmin_vec3(a_d, b_d, c_d, n, strm_)
1797#elif HAVE_OPENCL
1798 call opencl_pwmin_vec3(a_d, b_d, c_d, n, strm_)
1799#elif HAVE_METAL
1800 call metal_pwmin_vec3(a_d, b_d, c_d, n, strm_)
1801#else
1802 call neko_error('No device backend configured')
1803#endif
1804
1805 end subroutine device_pwmin3
1806
1809 subroutine device_cpwmin2(a_d, c, n, strm)
1810 type(c_ptr) :: a_d
1811 real(kind=rp), intent(in) :: c
1812 integer :: n
1813 type(c_ptr), optional :: strm
1814 type(c_ptr) :: strm_
1815
1816 if (n .lt. 1) return
1817
1818 if (present(strm)) then
1819 strm_ = strm
1820 else
1821 strm_ = glb_cmd_queue
1822 end if
1823
1824#if HAVE_HIP
1825 call hip_pwmin_sca2(a_d, c, n, strm_)
1826#elif HAVE_CUDA
1827 call cuda_pwmin_sca2(a_d, c, n, strm_)
1828#elif HAVE_OPENCL
1829 call opencl_pwmin_sca2(a_d, c, n, strm_)
1830#elif HAVE_METAL
1831 call metal_pwmin_sca2(a_d, c, n, strm_)
1832#else
1833 call neko_error('No device backend configured')
1834#endif
1835
1836 end subroutine device_cpwmin2
1837
1840 subroutine device_cpwmin3(a_d, b_d, c, n, strm)
1841 type(c_ptr) :: a_d, b_d
1842 real(kind=rp), intent(in) :: c
1843 integer :: n
1844 type(c_ptr), optional :: strm
1845 type(c_ptr) :: strm_
1846
1847 if (n .lt. 1) return
1848
1849 if (present(strm)) then
1850 strm_ = strm
1851 else
1852 strm_ = glb_cmd_queue
1853 end if
1854
1855#if HAVE_HIP
1856 call hip_pwmin_sca3(a_d, b_d, c, n, strm_)
1857#elif HAVE_CUDA
1858 call cuda_pwmin_sca3(a_d, b_d, c, n, strm_)
1859#elif HAVE_OPENCL
1860 call opencl_pwmin_sca3(a_d, b_d, c, n, strm_)
1861#elif HAVE_METAL
1862 call metal_pwmin_sca3(a_d, b_d, c, n, strm_)
1863#else
1864 call neko_error('No device backend configured')
1865#endif
1866
1867 end subroutine device_cpwmin3
1868
1869 ! ========================================================================== !
1870 ! Integer operations
1871
1873 subroutine device_iadd(a_d, c, n, strm)
1874 type(c_ptr), intent(inout) :: a_d
1875 integer, intent(in) :: c
1876 integer, intent(in) :: n
1877 type(c_ptr), optional :: strm
1878 type(c_ptr) :: strm_
1879 if (n .lt. 1) return
1880
1881 if (present(strm)) then
1882 strm_ = strm
1883 else
1884 strm_ = glb_cmd_queue
1885 end if
1886
1887#if HAVE_HIP
1888 call hip_iadd(a_d, c, n, strm_)
1889#elif HAVE_CUDA
1890 call cuda_iadd(a_d, c, n, strm_)
1891#elif HAVE_OPENCL
1892 call opencl_iadd(a_d, c, n, strm_)
1893#elif HAVE_METAL
1894 call metal_iadd(a_d, c, n, strm_)
1895#else
1896 call neko_error('No device backend configured')
1897#endif
1898 end subroutine device_iadd
1899
1900
1901end module device_math
Definition comm.F90:1
type(mpi_datatype), public mpi_real_precision
MPI type for working precision of REAL types.
Definition comm.F90:53
integer, public pe_size
MPI size of communicator.
Definition comm.F90:61
type(mpi_comm), public neko_comm
MPI communicator.
Definition comm.F90:45
subroutine, public device_pwmin2(a_d, b_d, n, strm)
Compute the point-wise minimum of two vectors .
subroutine, public device_masked_scatter_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
Scatter a masked vector .
subroutine, public device_add2s1(a_d, b_d, c1, n, strm)
subroutine, public device_add2s2_many(y_d, x_d_d, a_d, j, n, strm)
subroutine, public device_add4s3(a_d, b_d, c_d, d_d, c1, c2, c3, n, strm)
Returns .
subroutine, public device_sub3(a_d, b_d, c_d, n, strm)
Vector subtraction .
subroutine, public device_masked_scatter_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Scatter a masked vector .
subroutine, public device_add2s2(a_d, b_d, c1, n, strm)
Vector addition with scalar multiplication (multiplication on first argument)
real(kind=rp) function, public device_glmax(a_d, n, strm)
Max of a vector of length n.
subroutine, public device_add2(a_d, b_d, n, strm)
Vector addition .
subroutine, public device_addcol3(a_d, b_d, c_d, n, strm)
Returns .
real(kind=rp) function, public device_glsum(a_d, n, strm)
Sum a vector of length n.
subroutine, public device_pwmax3(a_d, b_d, c_d, n, strm)
Compute the point-wise maximum of two vectors .
subroutine, public device_invcol1(a_d, n, strm)
Invert a vector .
subroutine, public device_add3s2(a_d, b_d, c_d, c1, c2, n, strm)
Returns .
subroutine, public device_masked_atomic_reduction_0(a_d, b_d, mask_d, n, n_mask, strm)
subroutine, public device_cpwmax3(a_d, b_d, c, n, strm)
Compute the point-wise maximum of a vector and a scalar .
subroutine, public device_rzero(a_d, n, strm)
Zero a real vector.
subroutine, public device_rone(a_d, n, strm)
Set all elements to one.
subroutine, public device_vcross(u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, w1_d, w2_d, w3_d, n, strm)
Compute a cross product (3-d version) assuming vector components etc.
subroutine, public device_cmult(a_d, c, n, strm)
Multiplication by constant c .
subroutine, public device_vdot3(dot_d, u1_d, u2_d, u3_d, v1_d, v2_d, v3_d, n, strm)
Compute a dot product (3-d version) assuming vector components etc.
real(kind=rp) function, public device_glsubnorm(a_d, b_d, n, strm)
Returns the norm of the difference of two vectors .
subroutine, public device_glsc3_many(h, w_d, v_d_d, mult_d, j, n, strm)
subroutine, public device_cfill_mask(a_d, c, n, mask_d, n_mask, strm)
Fill a constant to a masked vector. .
subroutine, public device_cadd2(a_d, b_d, c, n, strm)
Add a scalar to vector .
subroutine, public device_sub2(a_d, b_d, n, strm)
Vector substraction .
subroutine, public device_copy(a_d, b_d, n, strm)
Copy a vector .
subroutine, public device_invcol3(a_d, b_d, c_d, n, strm)
Vector division .
subroutine, public device_pwmin3(a_d, b_d, c_d, n, strm)
Compute the point-wise minimum of two vectors .
subroutine, public device_col2(a_d, b_d, n, strm)
Vector multiplication .
subroutine, public device_masked_gather_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
Gather a masked vector .
real(kind=rp) function, public device_vlsc3(u_d, v_d, w_d, n, strm)
Compute multiplication sum .
subroutine, public device_cdiv2(a_d, b_d, c, n, strm)
Division of constant c by array .
subroutine, public device_add5s4(a_d, b_d, c_d, d_d, e_d, c1, c2, c3, c4, n, strm)
Returns .
subroutine, public device_add4(a_d, b_d, c_d, d_d, n, strm)
subroutine, public device_cpwmax2(a_d, c, n, strm)
Compute the point-wise maximum of a vector and a scalar .
subroutine, public device_masked_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Copy a masked vector .
subroutine, public device_subcol3(a_d, b_d, c_d, n, strm)
Returns .
subroutine, public device_cdiv(a_d, c, n, strm)
Division of constant c by array .
subroutine, public device_absval(a_d, n, strm)
subroutine device_iadd(a_d, c, n, strm)
Add an integer scalar to vector .
subroutine, public device_masked_gather_copy_0(a_d, b_d, mask_d, n, n_mask, strm)
Gather a masked vector .
subroutine, public device_invcol2(a_d, b_d, n, strm)
Vector division .
subroutine, public device_addsqr2s2(a_d, b_d, c1, n, strm)
Returns .
real(kind=rp) function, public device_glsc3(a_d, b_d, c_d, n, strm)
Weighted inner product .
subroutine, public device_cpwmin3(a_d, b_d, c, n, strm)
Compute the point-wise minimum of a vector and a scalar .
subroutine, public device_cwrap(a_d, min_val, max_val, n, strm)
Wrap value around a range (min, max)
subroutine, public device_face_masked_gather_copy_0(a_d, b_d, mask_d, facet_d, n1, n2, lx, ly, lz, n_mask, strm)
Gather a face-local SEM field .
real(kind=rp) function, public device_glsc2(a_d, b_d, n, strm)
Weighted inner product .
subroutine, public device_cmult2(a_d, b_d, c, n, strm)
Multiplication by constant c .
subroutine, public device_masked_copy_aligned(a_d, b_d, mask_d, n, n_mask, strm)
Copy a masked vector .
subroutine, public device_col3(a_d, b_d, c_d, n, strm)
Vector multiplication with 3 vectors .
subroutine, public device_addcol4(a_d, b_d, c_d, d_d, n, strm)
Returns .
subroutine, public device_cfill(a_d, c, n, strm)
Set all elements to a constant c .
subroutine, public device_cpwmin2(a_d, c, n, strm)
Compute the point-wise minimum of a vector and a scalar .
subroutine, public device_add3(a_d, b_d, c_d, n, strm)
Vector addition .
real(kind=rp) function, public device_glmin(a_d, n, strm)
Min of a vector of length n.
subroutine, public device_addcol3s2(a_d, b_d, c_d, s, n, strm)
Returns .
subroutine, public device_pwmax2(a_d, b_d, n, strm)
Compute the point-wise maximum of two vectors .
subroutine device_radd(a_d, c, n, strm)
Add a scalar to vector .
Device abstraction, common interface for various accelerators.
Definition device.F90:34
type(c_ptr), bind(C), public glb_cmd_queue
Global command queue.
Definition device.F90:52
integer, parameter, public c_rp
Definition num_types.f90:13
integer, parameter, public rp
Global precision used in computations.
Definition num_types.f90:12
Utilities.
Definition utils.f90:35