Neko 1.99.1
A portable framework for high-order spectral element flow simulations
math_kernel.h
#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__
/*
 Copyright (c) 2021-2025, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

/**
 * Device kernel for cmult: a = c * a
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}

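/*
 * Usage sketch (not from the Neko sources): every kernel in this file uses
 * the same grid-stride loop, so a host-side launch only needs enough blocks
 * to cover n once; extra elements are handled by the stride. The wrapper
 * name and launch configuration below are illustrative assumptions, kept
 * out of compilation with #if 0.
 */
#if 0
template< typename T >
void cmult_example_launch(T * d_a, const T c, const int n,
                          cudaStream_t stream) {
  const dim3 nthrds(1024, 1, 1);                  /* threads per block */
  const dim3 nblcks((n + 1024 - 1) / 1024, 1, 1); /* blocks covering n */
  cmult_kernel<T><<<nblcks, nthrds, 0, stream>>>(d_a, c, n);
}
#endif
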
/**
 * Device kernel for masked gather copy: a[i] = b[mask[i+1]-1].
 * Mask entries are 1-based indices, read starting at mask[1].
 */
template< typename T >
__global__ void masked_gather_copy_kernel(T * __restrict__ a,
                                          T * __restrict__ b,
                                          int * __restrict__ mask,
                                          const int n,
                                          const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}

/**
 * Device kernel for masked scatter copy: a[mask[i+1]-1] = b[i].
 */
template< typename T >
__global__ void masked_scatter_copy_kernel(T * __restrict__ a,
                                           T * __restrict__ b,
                                           int * __restrict__ mask,
                                           const int n,
                                           const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[i];
  }
}

/**
 * Device kernel for masked atomic reduction: a[mask[i+1]-1] += b[i].
 * The __CUDA_ARCH__ guard is needed because atomicAdd on double requires
 * compute capability 6.0 or newer.
 */
template< typename T >
__global__ void masked_atomic_reduction_kernel(T * __restrict__ a,
                                               T * __restrict__ b,
                                               int * __restrict__ mask,
                                               const int n,
                                               const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
#if __CUDA_ARCH__ >= 600
    atomicAdd( &(a[mask[i+1]-1]), b[i]);
#endif
  }
}

/**
 * Device kernel for masked copy: a[mask[i+1]-1] = b[mask[i+1]-1].
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}

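/*
 * Mask layout sketch (an assumption inferred from the mask[i+1]-1 indexing
 * above, not code from the Neko sources): the device mask carries 1-based
 * indices starting at mask[1], with slot 0 reserved (e.g. for the mask size
 * on the Fortran side). A host-side setup consistent with that layout,
 * kept out of compilation with #if 0:
 */
#if 0
#include <cuda_runtime.h>
#include <vector>

inline int * make_device_mask(const std::vector<int> & indices_1based) {
  std::vector<int> h_mask(indices_1based.size() + 1);
  h_mask[0] = (int) indices_1based.size(); /* reserved slot */
  for (size_t i = 0; i < indices_1based.size(); ++i)
    h_mask[i + 1] = indices_1based[i];     /* 1-based entries */

  int * d_mask = nullptr;
  cudaMalloc(&d_mask, h_mask.size() * sizeof(int));
  cudaMemcpy(d_mask, h_mask.data(), h_mask.size() * sizeof(int),
             cudaMemcpyHostToDevice);
  return d_mask;
}
#endif
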
/**
 * Device kernel for cfill_mask: a[mask[i]] = c. Unlike the masked copy
 * kernels above, this mask holds 0-based indices.
 */
template <typename T>
__global__ void cfill_mask_kernel(T* __restrict__ a,
                                  const T c,
                                  const int size,
                                  int* __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < mask_size; i += str) { a[mask[i]] = c; }
}

/**
 * Device kernel for cmult2: a = c * b
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/**
 * Device kernel for cdiv: a = c / a
 */
template< typename T >
__global__ void cdiv_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / a[i];
  }
}

/**
 * Device kernel for cdiv2: a = c / b
 */
template< typename T >
__global__ void cdiv2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / b[i];
  }
}

/**
 * Device kernel for cadd: a = a + c
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/**
 * Device kernel for cadd2: a = b + c
 */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}

/**
 * Device kernel for cfill: a = c
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/**
 * Device kernel for add2: a = a + b
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/**
 * Device kernel for add3: a = b + c
 */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}

/**
 * Device kernel for add4: a = b + c + d
 */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}

/**
 * Device kernel for add2s1: a = c1 * a + b
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/**
 * Device kernel for add2s2_many: x = x + sum_k alpha[k] * p[k]
 * for p_cur vectors at once.
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i]*alpha[j];
    }
    x[i] += tmp;
  }
}

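/*
 * Usage sketch (illustrative assumption, not from the Neko sources):
 * add2s2_many above and glsc3_many below take a device-resident array of
 * device pointers (T **). One way to build such a pointer table from host
 * pointers to device arrays, kept out of compilation with #if 0:
 */
#if 0
#include <cuda_runtime.h>

template< typename T >
T ** make_device_pointer_table(T * const * h_ptrs, const int count) {
  T ** d_ptrs = nullptr;
  cudaMalloc(&d_ptrs, count * sizeof(T *));
  cudaMemcpy(d_ptrs, h_ptrs, count * sizeof(T *),
             cudaMemcpyHostToDevice);
  return d_ptrs; /* pass as the p / b argument */
}
#endif
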
/**
 * Device kernel for add2s2: a = a + c1 * b
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/**
 * Device kernel for addsqr2s2: a = a + c1 * (b * b)
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/**
 * Device kernel for add3s2: a = c1 * b + c2 * c
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/**
 * Device kernel for invcol1: a = 1 / a
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/**
 * Device kernel for invcol2: a = a / b
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/**
 * Device kernel for invcol3: a = b / c
 */
template< typename T >
__global__ void invcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] / c[i];
  }
}

/**
 * Device kernel for col2: a = a * b
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/**
 * Device kernel for col3: a = b * c
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/**
 * Device kernel for subcol3: a = a - b * c
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/**
 * Device kernel for sub2: a = a - b
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/**
 * Device kernel for sub3: a = b - c
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/**
 * Device kernel for addcol3: a = a + b * c
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }

}

/**
 * Device kernel for addcol4: a = a + b * c * d
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }

}

/**
 * Device kernel for vdot3: dot = u . v for three-component vector fields.
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }

}

/**
 * Device kernel for vcross: u = v x w for three-component vector fields.
 */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }

}

/**
 * Warp shuffle reduction: sums val across all 32 lanes of a warp.
 */
template< typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}

/**
 * Reduction kernel: sums bufred[0..n-1] and writes one partial sum per
 * block back to bufred[blockIdx.x].
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str)
  {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

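/*
 * Usage sketch (illustrative assumption, not from the Neko sources):
 * reduce_kernel leaves one partial sum per block, so a device-wide total
 * takes a second, single-block pass over those partials. Kept out of
 * compilation with #if 0.
 */
#if 0
template< typename T >
T device_sum_finalize(T * d_bufred, const int nblocks) {
  reduce_kernel<T><<<1, 1024>>>(d_bufred, nblocks);
  T total;
  cudaMemcpy(&total, d_bufred, sizeof(T), cudaMemcpyDeviceToHost);
  return total; /* sum of all partials, from bufred[0] */
}
#endif
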
/**
 * Reduction kernel for glsc3_many: re-reduces the per-block partial sums
 * in bufred (n blocks times j products); block y handles product y.
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j
                                    ) {
  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step)
  {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  int i = 512;
  while (i != 0)
  {
    if (threadIdx.x < i && (threadIdx.x + i) < n )
    {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i>>1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}

/**
 * Device kernel for glsc3: per-block partial sums of the weighted inner
 * product sum(a * b * c), written to buf_h[blockIdx.x].
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

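/*
 * Usage sketch (illustrative assumption, not from the Neko sources): a
 * discrete weighted norm such as ||u||^2 = sum(u * u * w) maps onto glsc3
 * with b = a. The per-block partials in d_buf are then finished as in the
 * reduce_kernel sketch above (and, in a parallel run, summed across MPI
 * ranks). Kept out of compilation with #if 0.
 */
#if 0
template< typename T >
void weighted_norm2_partials(const T * d_u, const T * d_w,
                             T * d_buf, const int n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks((n + 1024 - 1) / 1024, 1, 1);
  glsc3_kernel<T><<<nblcks, nthrds>>>(d_u, d_u, d_w, d_buf, n);
}
#endif
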
/**
 * Device kernel for glsc3_many: computes j weighted inner products
 * sum(a * b[k] * c), k < j, in one pass. threadIdx.x spans the j products,
 * threadIdx.y strides the arrays, and each block writes its j partial
 * sums to buf_h[j*blockIdx.x + 0..j-1].
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i+= str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x+y] = tmp;
  __syncthreads();

  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x +y] += buf[(threadIdx.y + i)*blockDim.x+y];
    }
    __syncthreads();
    i = i>>1;
  }
  if (threadIdx.y == 0) {
    if( y < j) {
      buf_h[j*blockIdx.x+y] = buf[y];
    }
  }
}

/**
 * Device kernel for glsc2: per-block partial sums of the inner product
 * sum(a * b), written to buf_h[blockIdx.x].
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

/**
 * Device kernel for glsubnorm2: per-block partial sums of the squared
 * difference sum((a - b)^2), written to buf_h[blockIdx.x].
 */
template< typename T >
__global__ void glsubnorm2_kernel(const T * a,
                                  const T * b,
                                  T * buf_h,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += pow(a[i] - b[i], 2.0);
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

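/*
 * Usage sketch (illustrative assumption, not from the Neko sources):
 * glsubnorm2 yields partial sums of (a - b)^2, so the L2 norm of the
 * difference is the square root of the reduced total, taken after the
 * final reduction. Kept out of compilation with #if 0.
 */
#if 0
#include <cmath>

template< typename T >
T subnorm2_finalize(T * d_buf, const int nblocks) {
  reduce_kernel<T><<<1, 1024>>>(d_buf, nblocks);
  T sum;
  cudaMemcpy(&sum, d_buf, sizeof(T), cudaMemcpyDeviceToHost);
  return std::sqrt(sum); /* ||a - b||_2 */
}
#endif
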
/**
 * Device kernel for glsum: per-block partial sums of sum(a), written to
 * buf_h[blockIdx.x].
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str)
  {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

/**
 * Device kernel for absval: a = |a|
 */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}

// ========================================================================== //
// Kernels for the point-wise operations

/**
 * Device kernel for point-wise max: a = max(a, b)
 */
template <typename T>
__global__ void
pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], b[i]);
}

/**
 * Device kernel for point-wise max: a = max(b, c)
 */
template <typename T>
__global__ void pwmax_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
}

/**
 * Device kernel for point-wise max against a scalar: a = max(a, c)
 */
template <typename T>
__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], c);
}

/**
 * Device kernel for point-wise max against a scalar: a = max(b, c)
 */
template <typename T>
__global__ void pwmax_sca3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
}

/**
 * Device kernel for point-wise min: a = min(a, b)
 */
template <typename T>
__global__ void
pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], b[i]);
}

/**
 * Device kernel for point-wise min: a = min(b, c)
 */
template <typename T>
__global__ void pwmin_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
}

/**
 * Device kernel for point-wise min against a scalar: a = min(a, c)
 */
template <typename T>
__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], c);
}

/**
 * Device kernel for point-wise min against a scalar: a = min(b, c)
 */
template <typename T>
__global__ void pwmin_sca3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
}

#endif // __MATH_MATH_KERNEL_H__