Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math_kernel.h
Go to the documentation of this file.
1#ifndef __MATH_MATH_KERNEL_H__
2#define __MATH_MATH_KERNEL_H__
3/*
4 Copyright (c) 2021-2023, The Neko Authors
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following
16 disclaimer in the documentation and/or other materials provided
17 with the distribution.
18
19 * Neither the name of the authors nor the names of its
20 contributors may be used to endorse or promote products derived
21 from this software without specific prior written permission.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35*/
36
/** Device kernel for scalar multiplication: a = c * a (grid-stride loop). */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
52
/**
 * Gather b into a through a mask: a[i] = b[mask[i+1]-1].
 * NOTE(review): mask[0] is skipped and entries are decremented — presumably a
 * 1-based Fortran mask whose first slot holds metadata; confirm with caller.
 * The parameter n (size of b) is unused here; kept for API symmetry.
 */
template< typename T >
__global__ void masked_gather_copy_kernel(T * __restrict__ a,
                                          T * __restrict__ b,
                                          int * __restrict__ mask,
                                          const int n,
                                          const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}
70
/**
 * Gather b into a through a zero-based (aligned) mask: a[i] = b[mask[i]].
 * Unlike masked_gather_copy_kernel, the mask here is used directly with no
 * offset. The parameter n is unused; kept for API symmetry.
 */
template< typename T >
__global__ void masked_gather_copy_aligned_kernel(T * __restrict__ a,
                                                  T * __restrict__ b,
                                                  int * __restrict__ mask,
                                                  const int n,
                                                  const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[i] = b[mask[i]];
  }
}
88
/**
 * Scatter b into a through a mask: a[mask[i+1]-1] = b[i].
 * NOTE(review): assumes mask entries are unique so no two threads write the
 * same destination — confirm; use masked_atomic_reduction_kernel otherwise.
 */
template< typename T >
__global__ void masked_scatter_copy_kernel(T * __restrict__ a,
                                           T * __restrict__ b,
                                           int * __restrict__ mask,
                                           const int n,
                                           const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[i];
  }
}
106
/**
 * Atomically accumulate b into masked entries of a:
 * a[mask[i+1]-1] += b[i], safe when mask entries repeat.
 * Uses unsafeAtomicAdd (AMD/HIP fast floating-point atomic) — requires
 * hardware FP atomics; results are non-deterministically ordered.
 */
template< typename T >
__global__ void masked_atomic_reduction_kernel(T * __restrict__ a,
                                               T * __restrict__ b,
                                               int * __restrict__ mask,
                                               const int n,
                                               const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    unsafeAtomicAdd( &(a[mask[i+1]-1]), b[i]);
  }
}
125
/**
 * Copy the masked entries of b into the same masked entries of a:
 * a[mask[i+1]-1] = b[mask[i+1]-1] (1-based mask, first slot skipped).
 * The parameter n is unused; kept for API symmetry.
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}
143
/** Fill the masked (zero-based) entries of a with the constant c. */
template< typename T >
__global__ void cfill_mask_kernel(T * __restrict__ a,
                                  const T c,
                                  const int n,
                                  int * __restrict__ mask,
                                  const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i]] = c;
  }
}
159
/** Device kernel for scalar multiplication: a = c * b (grid-stride loop). */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}
176
/** Device kernel for scalar division: a = c / a (no guard against a[i] == 0). */
template< typename T >
__global__ void cdiv_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / a[i];
  }
}
192
/** Device kernel for scalar division: a = c / b (no guard against b[i] == 0). */
template< typename T >
__global__ void cdiv2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / b[i];
  }
}
209
/** Device kernel for scalar addition: a = a + c. */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}
225
/** Device kernel for scalar addition: a = b + c. */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}
242
/** Device kernel filling a with the constant c. */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}
258
/** Device kernel for vector addition: a = a + b. */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}
274
/** Device kernel for vector addition: a = b + c. */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}
291
/** Device kernel for vector addition: a = b + c + d. */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}
309
/** Device kernel for AXPY with scaled accumulator: a = c1 * a + b. */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}
326
/**
 * Fused multi-vector AXPY: x += sum_{j < p_cur} alpha[j] * p[j].
 * Accumulates the p_cur partial products into a register before the single
 * store to x, so x is read/written once per element.
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i] * alpha[j];
    }
    x[i] += tmp;
  }
}
349
/** Device kernel for AXPY: a = a + c1 * b. */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}
366
/** Device kernel accumulating a scaled square: a = a + c1 * b^2. */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}
383
/** Device kernel for scaled vector addition: a = c1 * b + c2 * c. */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}
402
/** Device kernel for scaled vector addition: a = c1 * b + c2 * c + c3 * d. */
template< typename T >
__global__ void add4s3_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T c1,
                              const T c2,
                              const T c3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i] + c3 * d[i];
  }
}
423
/** Device kernel accumulating four scaled vectors:
    a = a + c1 * b + c2 * c + c3 * d + c4 * e. */
template< typename T >
__global__ void add5s4_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T * __restrict__ e,
                              const T c1,
                              const T c2,
                              const T c3,
                              const T c4,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i] + c2 * c[i] + c3 * d[i] + c4 * e[i];
  }
}
446
/** Device kernel for in-place reciprocal: a = 1 / a. */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;  /* typed constant avoids a double literal in float kernels */

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}
462
/** Device kernel for element-wise division: a = a / b. */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}
478
/** Device kernel for element-wise division: a = b / c. */
template< typename T >
__global__ void invcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] / c[i];
  }
}
495
/** Device kernel for element-wise product: a = a * b. */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}
511
/** Device kernel for element-wise product: a = b * c. */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}
528
/** Device kernel subtracting an element-wise product: a = a - b * c. */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}
545
/** Device kernel for vector subtraction: a = a - b. */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}
561
/** Device kernel for vector subtraction: a = b - c. */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}
578
/** Device kernel accumulating an element-wise product: a = a + b * c. */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}
596
/** Device kernel accumulating a triple product: a = a + b * c * d. */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}
615
/** Device kernel accumulating a scaled product: a = a + s * b * c. */
template< typename T >
__global__ void addcol3s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T * __restrict__ c,
                                 const T s,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + s * b[i] * c[i];
  }
}
634
/** Device kernel for the point-wise 3-component dot product:
    dot = u1*v1 + u2*v2 + u3*v3. */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}
656
/** Device kernel for the point-wise cross product: (u1,u2,u3) = v x w. */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }
}
681
/** Sum-reduction across a wavefront; lane 0 holds the result.
    Offsets start at 32, so this assumes a 64-wide AMD wavefront (HIP
    mask-less __shfl_down) — NOTE(review): not portable to 32-wide warps. */
template< typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down(val, 32);
  val += __shfl_down(val, 16);
  val += __shfl_down(val, 8);
  val += __shfl_down(val, 4);
  val += __shfl_down(val, 2);
  val += __shfl_down(val, 1);
  return val;
}
695
/** Max-reduction across a 64-wide wavefront; lane 0 holds the result. */
template< typename T>
__inline__ __device__ T reduce_max_warp(T val) {
  val = max(val, __shfl_down(val, 32));
  val = max(val, __shfl_down(val, 16));
  val = max(val, __shfl_down(val, 8));
  val = max(val, __shfl_down(val, 4));
  val = max(val, __shfl_down(val, 2));
  val = max(val, __shfl_down(val, 1));
  return val;
}
709
/** Min-reduction across a 64-wide wavefront; lane 0 holds the result. */
template< typename T>
__inline__ __device__ T reduce_min_warp(T val) {
  val = min(val, __shfl_down(val, 32));
  val = min(val, __shfl_down(val, 16));
  val = min(val, __shfl_down(val, 8));
  val = min(val, __shfl_down(val, 4));
  val = min(val, __shfl_down(val, 2));
  val = min(val, __shfl_down(val, 1));
  return val;
}
723
/**
 * Final sum-reduction of per-block partials in bufred; block b writes its
 * total back to bufred[b]. Two-level scheme: wavefront shuffle reduction,
 * partials staged in shared memory, then one wavefront reduces the partials.
 * Requires blockDim.x <= 64 * warpSize (shared[] has 64 slots).
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[64];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
754
/**
 * Final max-reduction of per-block partials in bufred; block b writes its
 * maximum back to bufred[b]. ninf is the caller-supplied identity
 * (negative infinity) used to seed lanes that hold no data.
 */
template< typename T >
__global__ void reduce_max_kernel(T * bufred, const T ninf, const int n) {

  T max_val = ninf;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    max_val = max(max_val, bufred[i]);
  }

  __shared__ T shared[64];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  max_val = reduce_max_warp<T>(max_val);
  if (lane == 0)
    shared[wid] = max_val;
  __syncthreads();

  max_val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : ninf;
  if (wid == 0)
    max_val = reduce_max_warp<T>(max_val);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = max_val;
}
785
/**
 * Final min-reduction of per-block partials in bufred; block b writes its
 * minimum back to bufred[b]. pinf is the caller-supplied identity
 * (positive infinity) used to seed lanes that hold no data.
 */
template< typename T >
__global__ void reduce_min_kernel(T * bufred, const T pinf, const int n) {

  T min_val = pinf;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    min_val = min(min_val, bufred[i]);
  }

  __shared__ T shared[64];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  min_val = reduce_min_warp<T>(min_val);
  if (lane == 0)
    shared[wid] = min_val;
  __syncthreads();

  min_val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : pinf;
  if (wid == 0)
    min_val = reduce_min_warp<T>(min_val);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = min_val;
}
816
/**
 * Column-wise reduction of a j-wide buffer of partials: block y sums
 * bufred[i*j + y] over i and stores the total in bufred[y].
 * Shared-memory tree reduction; assumes blockDim.x <= 1024 and that the
 * fixed starting stride 512 covers blockDim.x (launch with 1024 threads).
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j
                                    ) {
  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  int i = 512;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i >> 1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}
851
/**
 * Block-local part of the weighted inner product sum(a * b * c); each block
 * writes its partial sum to buf_h[blockIdx.x] for a final reduce_kernel pass.
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
886
/**
 * Batched weighted inner products: for each y < j, compute the block-partial
 * of sum(a * b[y] * c) and store it at buf_h[j*blockIdx.x + y].
 * Launched with a 2D block: threadIdx.x indexes the vector y, threadIdx.y
 * strides over elements; a shared-memory tree over threadIdx.y combines the
 * per-thread partials. Requires blockDim.x * blockDim.y <= 1024 and
 * blockDim.y a power of two.
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x + y] = tmp;
  __syncthreads();

  int i = blockDim.y >> 1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x + y] += buf[(threadIdx.y + i)*blockDim.x + y];
    }
    __syncthreads();
    i = i >> 1;
  }
  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x + y] = buf[y];
    }
  }
}
927
/**
 * Block-local part of the inner product sum(a * b); each block writes its
 * partial sum to buf_h[blockIdx.x] for a final reduce_kernel pass.
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
962
/**
 * Block-local part of the squared difference norm sum((a - b)^2); each block
 * writes its partial sum to buf_h[blockIdx.x] for a final reduce_kernel pass.
 * Uses a plain product instead of pow(x, 2.0), avoiding a double-precision
 * pow call when T is float.
 */
template< typename T >
__global__ void glsubnorm2_kernel(const T * a,
                                  const T * b,
                                  T * buf_h,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    const T diff = a[i] - b[i];
    sum += diff * diff;
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
997
/**
 * Block-local part of the global sum of a; each block writes its partial
 * sum to buf_h[blockIdx.x] for a final reduce_kernel pass.
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
1032
/**
 * Block-local part of the global maximum of a; each block writes its partial
 * maximum to buf_h[blockIdx.x] for a final reduce_max_kernel pass.
 * ninf is the identity (negative infinity) for lanes holding no data.
 */
template< typename T >
__global__ void glmax_kernel(const T * a,
                             const T ninf,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T max_val = ninf;
  for (int i = idx; i < n; i += str) {
    max_val = max(max_val, a[i]);
  }

  max_val = reduce_max_warp<T>(max_val);
  if (lane == 0)
    shared[wid] = max_val;
  __syncthreads();

  max_val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : ninf;
  if (wid == 0)
    max_val = reduce_max_warp<T>(max_val);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = max_val;

}
1068
/**
 * Block-local part of the global minimum of a; each block writes its partial
 * minimum to buf_h[blockIdx.x] for a final reduce_min_kernel pass.
 * pinf is the identity (positive infinity) for lanes holding no data.
 */
template< typename T >
__global__ void glmin_kernel(const T * a,
                             const T pinf,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T min_val = pinf;
  for (int i = idx; i < n; i += str) {
    min_val = min(min_val, a[i]);
  }

  min_val = reduce_min_warp<T>(min_val);
  if (lane == 0)
    shared[wid] = min_val;
  __syncthreads();

  min_val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : pinf;
  if (wid == 0)
    min_val = reduce_min_warp<T>(min_val);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = min_val;

}
1104
/** Device kernel taking the absolute value in place: a = |a|.
    NOTE(review): fabs promotes to double for integral T; fine for
    float/double instantiations used here. */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}
1119
1120// ========================================================================== //
1121// Kernels for the point-wise operations
1122
/** Point-wise maximum with a vector: a = max(a, b). */
template <typename T>
__global__ void
  pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], b[i]);
}
1136
/** Point-wise maximum of two vectors: a = max(b, c). */
template <typename T>
__global__ void pwmax_vec3_kernel(
  T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
}
1151
/** Point-wise maximum with a scalar: a = max(a, c). */
template <typename T>
__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], c);
}
1164
/** Point-wise maximum of a vector and a scalar: a = max(b, c). */
template <typename T>
__global__ void pwmax_sca3_kernel(
  T* __restrict__ a, const T* __restrict__ b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
}
1178
/** Point-wise minimum with a vector: a = min(a, b). */
template <typename T>
__global__ void
  pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], b[i]);
}
1192
/** Point-wise minimum of two vectors: a = min(b, c). */
template <typename T>
__global__ void pwmin_vec3_kernel(
  T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
}
1207
/** Point-wise minimum with a scalar: a = min(a, c). */
template <typename T>
__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], c);
}
1220
/** Point-wise minimum of a vector and a scalar: a = min(b, c). */
template <typename T>
__global__ void pwmin_sca3_kernel(
  T* __restrict__ a, const T* __restrict__ b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
}
1234
1235#endif // __MATH_MATH_KERNEL_H__
const int i
const int e
const int j
__syncthreads()
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void addcol4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void pwmin_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void reduce_kernel(T *bufred, const int n)
__global__ void cdiv2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void invcol2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add4s3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T c1, const T c2, const T c3, const int n)
__inline__ __device__ T reduce_warp(T val)
__global__ void masked_atomic_reduction_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
__global__ void pwmax_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h, const int j, const int n)
__global__ void addcol3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T s, const int n)
__global__ void cfill_mask_kernel(T *__restrict__ a, const T c, const int size, int *__restrict__ mask, const int mask_size)
__global__ void cdiv_kernel(T *__restrict__ a, const T c, const int n)
__global__ void glsc3_reduce_kernel(T *bufred, const int n, const int j)
__global__ void masked_gather_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:76
__global__ void pwmax_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void reduce_max_kernel(T *bufred, const T ninf, const int n)
__global__ void glsubnorm2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void pwmin_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T c1, const T c2, const int n)
__inline__ __device__ T reduce_max_warp(T val)
__global__ void add2s1_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void add5s4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T *__restrict__ e, const T c1, const T c2, const T c3, const T c4, const int n)
__global__ void masked_gather_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:57
__global__ void add2s2_many_kernel(T *__restrict__ x, const T **p, const T *alpha, const int p_cur, const int n)
__global__ void pwmax_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void cmult_kernel(T *__restrict__ a, const T c, const int n)
Definition math_kernel.h:41
__global__ void addcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void pwmin_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void pwmax_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void col2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void col3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void sub2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void glmin_kernel(const T *a, const T pinf, T *buf_h, const int n)
__global__ void glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void cmult2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void pwmin_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void sub3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsum_kernel(const T *a, T *buf_h, const int n)
__global__ void masked_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
__global__ void add2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void vdot3_kernel(T *__restrict__ dot, const T *__restrict__ u1, const T *__restrict__ u2, const T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const int n)
__global__ void invcol1_kernel(T *__restrict__ a, const int n)
__global__ void add3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glmax_kernel(const T *a, const T ninf, T *buf_h, const int n)
__global__ void add4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cfill_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_scatter_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:95
__global__ void reduce_min_kernel(T *bufred, const T pinf, const int n)
__global__ void vcross_kernel(T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const T *__restrict__ w1, const T *__restrict__ w2, const T *__restrict__ w3, const int n)
__global__ void addsqr2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void cadd2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void absval_kernel(T *__restrict__ a, const int n)
__global__ void invcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void subcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void cadd_kernel(T *__restrict__ a, const T c, const int n)
__inline__ __device__ T reduce_min_warp(T val)
real * bufred
Definition math.cu:614
Object for handling masks in Neko.
Definition mask.f90:34
real * buf
Definition pipecg_aux.cu:42
#define max(a, b)
Definition tensor.cu:40