Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math_kernel.h
Go to the documentation of this file.
1#ifndef __MATH_MATH_KERNEL_H__
2#define __MATH_MATH_KERNEL_H__
3/*
4 Copyright (c) 2021-2025, The Neko Authors
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following
16 disclaimer in the documentation and/or other materials provided
17 with the distribution.
18
19 * Neither the name of the authors nor the names of its
20 contributors may be used to endorse or promote products derived
21 from this software without specific prior written permission.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35*/
36
/**
 * Device kernel for cmult: in-place scalar multiply, a = c * a.
 * Grid-stride loop; safe for any launch configuration.
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
52
/**
 * Device kernel for a masked gather copy: a[i] = b[mask[i+1]-1].
 * The i+1 / -1 offsets indicate the mask stores 1-based (Fortran) indices
 * starting at mask[1] — presumably mask[0] holds the mask length; confirm
 * against the Fortran caller.
 */
template< typename T >
__global__ void masked_gather_copy_kernel(T * __restrict__ a,
                                          T * __restrict__ b,
                                          int * __restrict__ mask,
                                          const int n,
                                          const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}
70
71
/**
 * Device kernel for an aligned masked gather copy: a[i] = b[mask[i]].
 * Unlike masked_gather_copy_kernel, the mask here is used directly as
 * 0-based indices with no leading length entry.
 */
template< typename T >
__global__ void masked_gather_copy_aligned_kernel(T * __restrict__ a,
                                                  T * __restrict__ b,
                                                  int * __restrict__ mask,
                                                  const int n,
                                                  const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[i] = b[mask[i]];
  }
}
89
90
/**
 * Device kernel for a masked scatter copy: a[mask[i+1]-1] = b[i].
 * Mask entries are 1-based (Fortran) indices starting at mask[1],
 * hence the i+1 and -1 offsets.
 */
template< typename T >
__global__ void masked_scatter_copy_kernel(T * __restrict__ a,
                                           T * __restrict__ b,
                                           int * __restrict__ mask,
                                           const int n,
                                           const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[i];
  }
}
108
109
113template< typename T >
115 T * __restrict__ b,
116 int * __restrict__ mask,
117 const int n,
118 const int m) {
119
120 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
121 const int str = blockDim.x * gridDim.x;
122
123 for (int i = idx; i < m; i += str) {
124#if __CUDA_ARCH__ >= 600
125 atomicAdd( &(a[mask[i+1]-1]), b[i]);
126#endif
127 }
128}
129
/**
 * Device kernel for a masked copy: a[k] = b[k] at the masked positions
 * k = mask[i+1]-1 (1-based Fortran indices stored from mask[1]).
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}
147
/**
 * Device kernel for a masked constant fill: a[mask[i]] = c.
 * Mask entries are used directly as 0-based indices here.
 */
template <typename T>
__global__ void cfill_mask_kernel(T* __restrict__ a,
                                  const T c,
                                  const int size,
                                  int* __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < mask_size; i += str) { a[mask[i]] = c; }
}
163
/**
 * Device kernel for cmult2: a = c * b (out-of-place scalar multiply).
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}
180
/**
 * Device kernel for cdiv: a = c / a (element-wise scalar-over-vector).
 * No guard against a[i] == 0; matches host-side semantics.
 */
template< typename T >
__global__ void cdiv_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / a[i];
  }
}
196
/**
 * Device kernel for cdiv2: a = c / b (out-of-place scalar-over-vector).
 */
template< typename T >
__global__ void cdiv2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / b[i];
  }
}
213
/**
 * Device kernel for cadd: a = a + c (add a scalar in place).
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}
229
/**
 * Device kernel for cadd2: a = b + c (out-of-place scalar add).
 */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}
246
/**
 * Device kernel for cfill: set every element to the constant c.
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}
262
/**
 * Device kernel for add2: a = a + b (element-wise vector add in place).
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}
278
/**
 * Device kernel for add3: a = b + c (element-wise vector add).
 */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}
295
/**
 * Device kernel for add4: a = b + c + d (element-wise three-vector sum).
 */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}
313
/**
 * Device kernel for add2s1: a = c1 * a + b (scaled in-place AXPY variant,
 * scale applied to a).
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}
330
/**
 * Device kernel for add2s2 applied to many vectors at once:
 * x = x + sum_j alpha[j] * p[j], for j = 0 .. p_cur-1.
 * Accumulates the inner sum in a register before the single store to x.
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;


  for (int i = idx; i < n; i+= str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i]*alpha[j];
    }
    x[i] += tmp;
  }
}
353
/**
 * Device kernel for add2s2: a = a + c1 * b (AXPY, scale applied to b).
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}
370
/**
 * Device kernel for addsqr2s2: a = a + c1 * b^2 (accumulate scaled square).
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}
387
/**
 * Device kernel for add3s2: a = c1 * b + c2 * c (scaled two-vector sum).
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}
406
/**
 * Device kernel for add4s3: a = c1 * b + c2 * c + c3 * d
 * (scaled three-vector sum, overwrites a).
 */
template< typename T >
__global__ void add4s3_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T c1,
                              const T c2,
                              const T c3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i] + c3 * d[i];
  }
}
427
/**
 * Device kernel for add5s4: a = a + c1*b + c2*c + c3*d + c4*e
 * (accumulate four scaled vectors onto a).
 */
template< typename T >
__global__ void add5s4_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T * __restrict__ e,
                              const T c1,
                              const T c2,
                              const T c3,
                              const T c4,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i] + c2 * c[i] + c3 * d[i] + c4 * e[i];
  }
}
450
/**
 * Device kernel for invcol1: a = 1 / a (element-wise reciprocal in place).
 * `one` is typed T so no double literal leaks into float instantiations.
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}
466
/**
 * Device kernel for invcol2: a = a / b (element-wise divide in place).
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}
482
/**
 * Device kernel for invcol3: a = b / c (element-wise divide).
 */
template< typename T >
__global__ void invcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] / c[i];
  }
}
499
/**
 * Device kernel for col2: a = a * b (element-wise multiply in place).
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}
515
/**
 * Device kernel for col3: a = b * c (element-wise multiply).
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}
532
/**
 * Device kernel for subcol3: a = a - b * c (subtract element-wise product).
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}
549
/**
 * Device kernel for sub2: a = a - b (element-wise subtract in place).
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}
565
/**
 * Device kernel for sub3: a = b - c (element-wise subtract).
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}
582
/**
 * Device kernel for addcol3: a = a + b * c (accumulate element-wise product).
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }

}
600
/**
 * Device kernel for addcol4: a = a + b * c * d
 * (accumulate element-wise triple product).
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }

}
619
/**
 * Device kernel for addcol3s2: a = a + s * b * c
 * (accumulate scaled element-wise product).
 */
template< typename T >
__global__ void addcol3s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T * __restrict__ c,
                                 const T s,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + s * b[i] * c[i];
  }

}
638
/**
 * Device kernel for vdot3: per-point dot product of two 3-component
 * vector fields, dot = u . v.
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }

}
660
/**
 * Device kernel for vcross: per-point cross product of two 3-component
 * vector fields, u = v x w.
 */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }

}
686
687
/**
 * Warp-level sum reduction via shuffle-down; after the last step, lane 0
 * holds the sum over the full warp. Full-warp mask assumes all 32 lanes
 * participate. Requires the *_sync intrinsics (CUDA 9+/Volta-safe).
 */
template< typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}
700
/**
 * Warp-level max reduction via shuffle-down; lane 0 ends with the warp max.
 * Full-warp mask assumes all 32 lanes participate.
 */
template< typename T>
__inline__ __device__ T reduce_max_warp(T val) {
  val = max(val, __shfl_down_sync(0xffffffff, val, 16));
  val = max(val, __shfl_down_sync(0xffffffff, val, 8));
  val = max(val, __shfl_down_sync(0xffffffff, val, 4));
  val = max(val, __shfl_down_sync(0xffffffff, val, 2));
  val = max(val, __shfl_down_sync(0xffffffff, val, 1));
  return val;
}
713
/**
 * Warp-level min reduction via shuffle-down; lane 0 ends with the warp min.
 * Full-warp mask assumes all 32 lanes participate.
 */
template< typename T>
__inline__ __device__ T reduce_min_warp(T val) {
  val = min(val, __shfl_down_sync(0xffffffff, val, 16));
  val = min(val, __shfl_down_sync(0xffffffff, val, 8));
  val = min(val, __shfl_down_sync(0xffffffff, val, 4));
  val = min(val, __shfl_down_sync(0xffffffff, val, 2));
  val = min(val, __shfl_down_sync(0xffffffff, val, 1));
  return val;
}
726
/**
 * Second-stage sum reduction over a partial-result buffer: each block sums a
 * grid-strided slice, reduces it warp-by-warp through shared memory, and
 * writes the block's total back to bufred[blockIdx.x].
 * Assumes blockDim.x <= 1024 (at most 32 warps -> 32 shared slots).
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i<n ; i += str)
  {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  /* First warp reduces the per-warp partial sums. */
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
757
/**
 * Second-stage max reduction over a partial-result buffer.
 * @param ninf negative-infinity sentinel used as the identity element.
 * Assumes blockDim.x <= 1024.
 */
template< typename T >
__global__ void reduce_max_kernel(T * bufred, const T ninf, const int n) {

  T max_val = ninf;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i<n ; i += str)
  {
    max_val = max(max_val, bufred[i]);
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  max_val = reduce_max_warp<T>(max_val);
  if (lane == 0)
    shared[wid] = max_val;
  __syncthreads();

  max_val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : ninf;
  if (wid == 0)
    max_val = reduce_max_warp<T>(max_val);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = max_val;
}
788
/**
 * Second-stage min reduction over a partial-result buffer.
 * @param pinf positive-infinity sentinel used as the identity element.
 * Assumes blockDim.x <= 1024.
 */
template< typename T >
__global__ void reduce_min_kernel(T * bufred, const T pinf, const int n) {

  T min_val = pinf;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i<n ; i += str)
  {
    min_val = min(min_val, bufred[i]);
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  min_val = reduce_min_warp<T>(min_val);
  if (lane == 0)
    shared[wid] = min_val;
  __syncthreads();

  min_val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : pinf;
  if (wid == 0)
    min_val = reduce_min_warp<T>(min_val);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = min_val;
}
819
/**
 * Reduction stage for glsc3_many: column y of the n-by-j partial-result
 * matrix bufred is summed by block y via a shared-memory tree reduction,
 * and the column total is written back to bufred[y].
 * Assumes blockDim.x <= 1024 (shared buffer size).
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j
                                    ) {
  __shared__ T buf[1024] ;
  const int idx = threadIdx.x;
  const int y= blockIdx.x;
  const int step = blockDim.x;

  buf[idx]=0;
  for (int i=idx ; i<n ; i+=step)
  {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  /* Tree reduction over the shared buffer. */
  int i = 512;
  while (i != 0)
  {
    if(threadIdx.x < i && (threadIdx.x + i) < n )
    {
      buf[threadIdx.x] += buf[threadIdx.x + i] ;
    }
    i = i>>1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}
854
855
/**
 * First stage of the weighted inner product glsc3: each block computes a
 * partial sum of a[i]*b[i]*c[i] and stores it in buf_h[blockIdx.x]; a
 * second-stage reduction (reduce_kernel) finishes on the partial buffer.
 * Assumes blockDim.x <= 1024.
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
890
/**
 * First stage of glsc3 against many vectors b[0..j-1] at once: thread column
 * threadIdx.x handles vector y (y < j), rows grid-stride over the data, and a
 * shared-memory tree reduction over threadIdx.y collapses the per-row
 * partials. Block output: buf_h[j*blockIdx.x + y].
 * Assumes blockDim.x * blockDim.y <= 1024 and blockDim.y a power of two.
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if(y < j){
    for (int i = idx; i < n; i+= str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x+y] = tmp;
  __syncthreads();

  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x +y] += buf[(threadIdx.y + i)*blockDim.x+y];
    }
    __syncthreads();
    i = i>>1;
  }
  if (threadIdx.y == 0) {
    if( y < j) {
      buf_h[j*blockIdx.x+y] = buf[y];
    }
  }
}
931
/**
 * First stage of the inner product glsc2: per-block partial sum of a[i]*b[i]
 * written to buf_h[blockIdx.x]; finished by a second-stage reduction.
 * Assumes blockDim.x <= 1024.
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
966
/**
 * First stage of the squared difference norm: per-block partial sum of
 * (a[i]-b[i])^2 written to buf_h[blockIdx.x]; finished by a second-stage
 * reduction. Assumes blockDim.x <= 1024.
 */
template< typename T >
__global__ void glsubnorm2_kernel(const T * a,
                                  const T * b,
                                  T * buf_h,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += pow(a[i] - b[i], 2.0);
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
1001
/**
 * First stage of a global sum: per-block partial sum of a[i] written to
 * buf_h[blockIdx.x]; finished by a second-stage reduction.
 * Assumes blockDim.x <= 1024.
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i<n ; i += str)
  {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
1036
/**
 * First stage of a global max: per-block maximum of a[i] written to
 * buf_h[blockIdx.x]; finished by reduce_max_kernel.
 * @param ninf negative-infinity sentinel (identity element).
 * Assumes blockDim.x <= 1024.
 */
template< typename T >
__global__ void glmax_kernel(const T * a,
                             const T ninf,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T max_val = ninf;
  for (int i = idx; i<n ; i += str)
  {
    max_val = max(max_val, a[i]);
  }

  max_val = reduce_max_warp<T>(max_val);
  if (lane == 0)
    shared[wid] = max_val;
  __syncthreads();

  max_val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : ninf;
  if (wid == 0)
    max_val = reduce_max_warp<T>(max_val);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = max_val;

}
1072
/**
 * First stage of a global min: per-block minimum of a[i] written to
 * buf_h[blockIdx.x]; finished by reduce_min_kernel.
 * @param pinf positive-infinity sentinel (identity element).
 * Assumes blockDim.x <= 1024.
 */
template< typename T >
__global__ void glmin_kernel(const T * a,
                             const T pinf,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T min_val = pinf;
  for (int i = idx; i<n ; i += str)
  {
    min_val = min(min_val, a[i]);
  }

  min_val = reduce_min_warp<T>(min_val);
  if (lane == 0)
    shared[wid] = min_val;
  __syncthreads();

  min_val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : pinf;
  if (wid == 0)
    min_val = reduce_min_warp<T>(min_val);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = min_val;

}
1108
/**
 * Device kernel for absval: a = |a| (element-wise absolute value in place).
 */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}
1123
1124// ========================================================================== //
1125// Kernels for the point-wise operations
1126
/**
 * Point-wise maximum of two vectors, in place: a[i] = max(a[i], b[i]).
 */
template <typename T>
__global__ void
  pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int start = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = start; i < n; i += stride) {
    a[i] = max(a[i], b[i]);
  }
}
1140
/**
 * Point-wise maximum of two vectors: a[i] = max(b[i], c[i]).
 */
template <typename T>
__global__ void pwmax_vec3_kernel(
  T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
}
1155
/**
 * Point-wise maximum against a scalar, in place: a[i] = max(a[i], c).
 */
template <typename T>
__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int start = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = start; i < n; i += stride) {
    a[i] = max(a[i], c);
  }
}
1168
/**
 * Point-wise maximum of a vector and a scalar: a[i] = max(b[i], c).
 */
template <typename T>
__global__ void pwmax_sca3_kernel(
  T* __restrict__ a, const T* __restrict b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
}
1182
/**
 * Point-wise minimum of two vectors, in place: a[i] = min(a[i], b[i]).
 */
template <typename T>
__global__ void
  pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int start = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = start; i < n; i += stride) {
    a[i] = min(a[i], b[i]);
  }
}
1196
/**
 * Point-wise minimum of two vectors: a[i] = min(b[i], c[i]).
 */
template <typename T>
__global__ void pwmin_vec3_kernel(
  T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
}
1211
/**
 * Point-wise minimum against a scalar, in place: a[i] = min(a[i], c).
 */
template <typename T>
__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int start = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = start; i < n; i += stride) {
    a[i] = min(a[i], c);
  }
}
1224
/**
 * Point-wise minimum of a vector and a scalar: a[i] = min(b[i], c).
 */
template <typename T>
__global__ void pwmin_sca3_kernel(
  T* __restrict__ a, const T* __restrict b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
}
1238
1239#endif // __MATH_MATH_KERNEL_H__
const int i
const int e
const int j
__syncthreads()
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void addcol4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void pwmin_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void reduce_kernel(T *bufred, const int n)
__global__ void cdiv2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void invcol2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add4s3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T c1, const T c2, const T c3, const int n)
__inline__ __device__ T reduce_warp(T val)
__global__ void masked_atomic_reduction_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
__global__ void pwmax_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h, const int j, const int n)
__global__ void addcol3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T s, const int n)
__global__ void cfill_mask_kernel(T *__restrict__ a, const T c, const int size, int *__restrict__ mask, const int mask_size)
__global__ void cdiv_kernel(T *__restrict__ a, const T c, const int n)
__global__ void glsc3_reduce_kernel(T *bufred, const int n, const int j)
__global__ void masked_gather_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:76
__global__ void pwmax_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void reduce_max_kernel(T *bufred, const T ninf, const int n)
__global__ void glsubnorm2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void pwmin_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T c1, const T c2, const int n)
__inline__ __device__ T reduce_max_warp(T val)
__global__ void add2s1_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void add5s4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T *__restrict__ e, const T c1, const T c2, const T c3, const T c4, const int n)
__global__ void masked_gather_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:57
__global__ void add2s2_many_kernel(T *__restrict__ x, const T **p, const T *alpha, const int p_cur, const int n)
__global__ void pwmax_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void cmult_kernel(T *__restrict__ a, const T c, const int n)
Definition math_kernel.h:41
__global__ void addcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void pwmin_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void pwmax_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void col2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void col3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void sub2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void glmin_kernel(const T *a, const T pinf, T *buf_h, const int n)
__global__ void glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void cmult2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void pwmin_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void sub3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsum_kernel(const T *a, T *buf_h, const int n)
__global__ void masked_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
__global__ void add2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void vdot3_kernel(T *__restrict__ dot, const T *__restrict__ u1, const T *__restrict__ u2, const T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const int n)
__global__ void invcol1_kernel(T *__restrict__ a, const int n)
__global__ void add3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glmax_kernel(const T *a, const T ninf, T *buf_h, const int n)
__global__ void add4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cfill_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_scatter_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:95
__global__ void reduce_min_kernel(T *bufred, const T pinf, const int n)
__global__ void vcross_kernel(T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const T *__restrict__ w1, const T *__restrict__ w2, const T *__restrict__ w3, const int n)
__global__ void addsqr2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void cadd2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void absval_kernel(T *__restrict__ a, const int n)
__global__ void invcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void subcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void cadd_kernel(T *__restrict__ a, const T c, const int n)
__inline__ __device__ T reduce_min_warp(T val)
real * bufred
Definition math.cu:614
Object for handling masks in Neko.
Definition mask.f90:34
real * buf
Definition pipecg_aux.cu:42
#define max(a, b)
Definition tensor.cu:40