Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math_kernel.h
Go to the documentation of this file.
1#ifndef __MATH_MATH_KERNEL_H__
2#define __MATH_MATH_KERNEL_H__
3/*
4 Copyright (c) 2021-2023, The Neko Authors
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following
16 disclaimer in the documentation and/or other materials provided
17 with the distribution.
18
19 * Neither the name of the authors nor the names of its
20 contributors may be used to endorse or promote products derived
21 from this software without specific prior written permission.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35*/
36
40template< typename T >
42 const T c,
43 const int n) {
44
45 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
46 const int str = blockDim.x * gridDim.x;
47
48 for (int i = idx; i < n; i += str) {
49 a[i] = c * a[i];
50 }
51}
52
56template< typename T >
58 T * __restrict__ b,
59 int * __restrict__ mask,
60 const int n,
61 const int n_mask) {
62
63 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
64 const int str = blockDim.x * gridDim.x;
65
66 for (int i = idx; i < n_mask; i += str) {
67 a[i] = b[mask[i+1]-1];
68 }
69}
70
74template< typename T >
76 T * __restrict__ b,
77 int * __restrict__ mask,
78 const int n,
79 const int n_mask) {
80
81 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
82 const int str = blockDim.x * gridDim.x;
83
84 for (int i = idx; i < n_mask; i += str) {
85 a[i] = b[mask[i]];
86 }
87}
88
90void face_gather_nonlinear_index(int *index, const int idx, const int lx,
91 const int ly, const int lz) {
92 const int idx2 = idx - 1;
93 index[3] = idx2 / (lx * ly * lz);
94 index[2] = (idx2 - (lx * ly * lz) * index[3]) / (lx * ly);
95 index[1] = (idx2 - (lx * ly * lz) * index[3] - (lx * ly) * index[2]) / lx;
96 index[0] = (idx2 - (lx * ly * lz) * index[3] - (lx * ly) * index[2]) -
97 lx * index[1];
98 index[0]++;
99 index[1]++;
100 index[2]++;
101 index[3]++;
102}
103
105int face_gather_idx(const int i, const int j, const int k, const int l,
106 const int n1, const int n2, const int nf) {
107 return ((i) + (n1) * (((j) - 1) + (n2) * (((k) - 1) + (nf) * (((l) - 1))))) - 1;
108}
109
113template< typename T >
115 const T * __restrict__ b,
116 const int * __restrict__ mask,
117 const int * __restrict__ facet,
118 const int n1,
119 const int n2,
120 const int lx,
121 const int ly,
122 const int lz,
123 const int n_mask) {
124 int index[4];
125
126 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
127 const int str = blockDim.x * gridDim.x;
128
129 for (int m = idx; m < n_mask; m += str) {
130 const int f = facet[m + 1];
131 face_gather_nonlinear_index(index, mask[m + 1], lx, ly, lz);
132
133 switch (f) {
134 case 1:
135 case 2:
136 a[m] = b[face_gather_idx(index[1], index[2], f, index[3], n1, n2, 6)];
137 break;
138 case 3:
139 case 4:
140 a[m] = b[face_gather_idx(index[0], index[2], f, index[3], n1, n2, 6)];
141 break;
142 case 5:
143 case 6:
144 a[m] = b[face_gather_idx(index[0], index[1], f, index[3], n1, n2, 6)];
145 break;
146 }
147 }
148}
149
153template< typename T >
155 T * __restrict__ b,
156 int * __restrict__ mask,
157 const int n,
158 const int n_mask) {
159
160 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
161 const int str = blockDim.x * gridDim.x;
162
163 for (int i = idx; i < n_mask; i += str) {
164 a[mask[i+1]-1] = b[i];
165 }
166}
167
171template< typename T >
173 T * __restrict__ b,
174 int * __restrict__ mask,
175 const int n,
176 const int n_mask) {
177
178 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
179 const int str = blockDim.x * gridDim.x;
180
181 for (int i = idx; i < n_mask; i += str) {
182 a[mask[i]] = b[i];
183 }
184}
185
189template< typename T >
191 T * __restrict__ b,
192 int * __restrict__ mask,
193 const int n,
194 const int n_mask) {
195
196 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
197 const int str = blockDim.x * gridDim.x;
198
199 for (int i = idx; i < n_mask; i += str) {
200 unsafeAtomicAdd( &(a[mask[i+1]-1]), b[i]);//a[mask[i]-1] = a[mask[i]-1] + b[i];
201 //atomicAdd( &(a[mask[i+1]-1]), b[i]);//a[mask[i]-1] = a[mask[i]-1] + b[i];
202 }
203}
204
208template< typename T >
210 T * __restrict__ b,
211 int * __restrict__ mask,
212 const int n,
213 const int n_mask) {
214
215 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
216 const int str = blockDim.x * gridDim.x;
217
218 for (int i = idx; i < n_mask; i += str) {
219 a[mask[i+1]-1] = b[mask[i+1]-1];
220 }
221}
222
226template< typename T >
228 const T c,
229 const int n,
230 int* __restrict__ mask,
231 const int n_mask) {
232
233 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
234 const int str = blockDim.x * gridDim.x;
235
236 for (int i = idx; i < n_mask; i += str) { a[mask[i]] = c; }
237}
238
242template< typename T >
244 T * __restrict__ b,
245 const T c,
246 const int n) {
247
248 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
249 const int str = blockDim.x * gridDim.x;
250
251 for (int i = idx; i < n; i += str) {
252 a[i] = c * b[i];
253 }
254}
255
259template< typename T >
261 const T c,
262 const int n) {
263
264 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
265 const int str = blockDim.x * gridDim.x;
266
267 for (int i = idx; i < n; i += str) {
268 a[i] = c / a[i];
269 }
270}
271
275template< typename T >
277 T * __restrict__ b,
278 const T c,
279 const int n) {
280
281 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
282 const int str = blockDim.x * gridDim.x;
283
284 for (int i = idx; i < n; i += str) {
285 a[i] = c / b[i];
286 }
287}
288
292template< typename T >
294 const T c,
295 const int n) {
296
297 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
298 const int str = blockDim.x * gridDim.x;
299
300 for (int i = idx; i < n; i += str) {
301 a[i] = a[i] + c;
302 }
303}
304
308template< typename T >
310 T * __restrict__ b,
311 const T c,
312 const int n) {
313
314 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
315 const int str = blockDim.x * gridDim.x;
316
317 for (int i = idx; i < n; i += str) {
318 a[i] = b[i] + c;
319 }
320}
321
325template< typename T >
327 const T min_val,
328 const T max_val,
329 const int n) {
330
331 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
332 const int str = blockDim.x * gridDim.x;
333 const T l = max_val - min_val;
334
335 for (int i = idx; i < n; i += str) {
336 a[i] = min_val + fmod(fmod(a[i] - min_val, l) + l, l);
337 }
338}
339
343template< typename T >
345 const T c,
346 const int n) {
347
348 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
349 const int str = blockDim.x * gridDim.x;
350
351 for (int i = idx; i < n; i += str) {
352 a[i] = c;
353 }
354}
355
359template< typename T >
361 const T * __restrict__ b,
362 const int n) {
363
364 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
365 const int str = blockDim.x * gridDim.x;
366
367 for (int i = idx; i < n; i += str) {
368 a[i] = a[i] + b[i];
369 }
370}
371
375template< typename T >
377 const T * __restrict__ b,
378 const T * __restrict__ c,
379 const int n) {
380
381 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
382 const int str = blockDim.x * gridDim.x;
383
384 for (int i = idx; i < n; i += str) {
385 a[i] = b[i] + c[i];
386 }
387}
388
392template< typename T >
394 const T * __restrict__ b,
395 const T * __restrict__ c,
396 const T * __restrict__ d,
397 const int n) {
398
399 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
400 const int str = blockDim.x * gridDim.x;
401
402 for (int i = idx; i < n; i += str) {
403 a[i] = b[i] + c[i] + d[i];
404 }
405}
406
410template< typename T >
412 const T * __restrict__ b,
413 const T c1,
414 const int n) {
415
416 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
417 const int str = blockDim.x * gridDim.x;
418
419 for (int i = idx; i < n; i += str) {
420 a[i] = c1 * a[i] + b[i];
421 }
422}
423
427template< typename T >
429 const T ** p,
430 const T * alpha,
431 const int p_cur,
432 const int n) {
433
434 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
435 const int str = blockDim.x * gridDim.x;
436
437
438 for (int i = idx; i < n; i+= str) {
439 T tmp = 0.0;
440 for (int j = 0; j < p_cur; j ++) {
441 tmp += p[j][i]*alpha[j];
442 }
443 x[i] += tmp;
444 }
445}
446
450template< typename T >
452 const T * __restrict__ b,
453 const T c1,
454 const int n) {
455
456 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
457 const int str = blockDim.x * gridDim.x;
458
459 for (int i = idx; i < n; i += str) {
460 a[i] = a[i] + c1 * b[i];
461 }
462}
463
467template< typename T >
469 const T * __restrict__ b,
470 const T c1,
471 const int n) {
472
473 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
474 const int str = blockDim.x * gridDim.x;
475
476 for (int i = idx; i < n; i += str) {
477 a[i] = a[i] + c1 * (b[i] * b[i]);
478 }
479}
480
484template< typename T >
486 const T * __restrict__ b,
487 const T * __restrict__ c,
488 const T c1,
489 const T c2,
490 const int n) {
491
492 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
493 const int str = blockDim.x * gridDim.x;
494
495 for (int i = idx; i < n; i += str) {
496 a[i] = c1 * b[i] + c2 * c[i];
497 }
498}
499
503template< typename T >
505 const T * __restrict__ b,
506 const T * __restrict__ c,
507 const T * __restrict__ d,
508 const T c1,
509 const T c2,
510 const T c3,
511 const int n) {
512
513 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
514 const int str = blockDim.x * gridDim.x;
515
516 for (int i = idx; i < n; i += str) {
517 a[i] = c1 * b[i] + c2 * c[i] + c3 * d[i];
518 }
519}
520
524template< typename T >
526 const T * __restrict__ b,
527 const T * __restrict__ c,
528 const T * __restrict__ d,
529 const T * __restrict__ e,
530 const T c1,
531 const T c2,
532 const T c3,
533 const T c4,
534 const int n) {
535
536 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
537 const int str = blockDim.x * gridDim.x;
538
539 for (int i = idx; i < n; i += str) {
540 a[i] = a[i] + c1 * b[i] + c2 * c[i] + c3 * d[i] + c4 * e[i];
541 }
542}
543
547template< typename T >
549 const int n) {
550
551 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
552 const int str = blockDim.x * gridDim.x;
553 const T one = 1.0;
554
555 for (int i = idx; i < n; i += str) {
556 a[i] = one / a[i];
557 }
558}
559
563template< typename T >
565 const T * __restrict__ b,
566 const int n) {
567
568 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
569 const int str = blockDim.x * gridDim.x;
570
571 for (int i = idx; i < n; i += str) {
572 a[i] = a[i] / b[i];
573 }
574}
575
579template< typename T >
581 const T * __restrict__ b,
582 const T * __restrict__ c,
583 const int n) {
584
585 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
586 const int str = blockDim.x * gridDim.x;
587
588 for (int i = idx; i < n; i += str) {
589 a[i] = b[i] / c[i];
590 }
591}
592
596template< typename T >
598 const T * __restrict__ b,
599 const int n) {
600
601 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
602 const int str = blockDim.x * gridDim.x;
603
604 for (int i = idx; i < n; i += str) {
605 a[i] = a[i] * b[i];
606 }
607}
608
612template< typename T >
614 const T * __restrict__ b,
615 const T * __restrict__ c,
616 const int n) {
617
618 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
619 const int str = blockDim.x * gridDim.x;
620
621 for (int i = idx; i < n; i += str) {
622 a[i] = b[i] * c[i];
623 }
624}
625
629template< typename T >
631 const T * __restrict__ b,
632 const T * __restrict__ c,
633 const int n) {
634
635 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
636 const int str = blockDim.x * gridDim.x;
637
638 for (int i = idx; i < n; i += str) {
639 a[i] = a[i] - b[i] * c[i];
640 }
641}
642
646template< typename T >
648 const T * __restrict__ b,
649 const int n) {
650
651 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
652 const int str = blockDim.x * gridDim.x;
653
654 for (int i = idx; i < n; i += str) {
655 a[i] = a[i] - b[i];
656 }
657}
658
662template< typename T >
664 const T * __restrict__ b,
665 const T * __restrict__ c,
666 const int n) {
667
668 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
669 const int str = blockDim.x * gridDim.x;
670
671 for (int i = idx; i < n; i += str) {
672 a[i] = b[i] - c[i];
673 }
674}
675
679template< typename T >
681 const T * __restrict__ b,
682 const T * __restrict__ c,
683 const int n) {
684
685 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
686 const int str = blockDim.x * gridDim.x;
687
688 for (int i = idx; i < n; i += str) {
689 a[i] = a[i] + b[i] * c[i];
690 }
691
692}
693
697template< typename T >
699 const T * __restrict__ b,
700 const T * __restrict__ c,
701 const T * __restrict__ d,
702 const int n) {
703
704 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
705 const int str = blockDim.x * gridDim.x;
706
707 for (int i = idx; i < n; i += str) {
708 a[i] = a[i] + b[i] * c[i] * d[i];
709 }
710
711}
712
716template< typename T >
718 const T * __restrict__ b,
719 const T * __restrict__ c,
720 const T s,
721 const int n) {
722
723 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
724 const int str = blockDim.x * gridDim.x;
725
726 for (int i = idx; i < n; i += str) {
727 a[i] = a[i] + s * b[i] * c[i];
728 }
729
730}
731
735template< typename T >
737 const T * __restrict__ u1,
738 const T * __restrict__ u2,
739 const T * __restrict__ u3,
740 const T * __restrict__ v1,
741 const T * __restrict__ v2,
742 const T * __restrict__ v3,
743 const int n) {
744
745 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
746 const int str = blockDim.x * gridDim.x;
747
748 for (int i = idx; i < n; i += str) {
749 dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
750 }
751
752}
753
757template< typename T >
759 T * __restrict__ u2,
760 T * __restrict__ u3,
761 const T * __restrict__ v1,
762 const T * __restrict__ v2,
763 const T * __restrict__ v3,
764 const T * __restrict__ w1,
765 const T * __restrict__ w2,
766 const T * __restrict__ w3,
767 const int n) {
768
769 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
770 const int str = blockDim.x * gridDim.x;
771
772 for (int i = idx; i < n; i += str) {
773 u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
774 u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
775 u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
776 }
777}
778
782template< typename T>
784 val += __shfl_down(val, 32);
785 val += __shfl_down(val, 16);
786 val += __shfl_down(val, 8);
787 val += __shfl_down(val, 4);
788 val += __shfl_down(val, 2);
789 val += __shfl_down(val, 1);
790 return val;
791}
792
796template< typename T>
798 val = max(val, __shfl_down(val, 32));
799 val = max(val, __shfl_down(val, 16));
800 val = max(val, __shfl_down(val, 8));
801 val = max(val, __shfl_down(val, 4));
802 val = max(val, __shfl_down(val, 2));
803 val = max(val, __shfl_down(val, 1));
804 return val;
805}
806
810template< typename T>
812 val = min(val, __shfl_down(val, 32));
813 val = min(val, __shfl_down(val, 16));
814 val = min(val, __shfl_down(val, 8));
815 val = min(val, __shfl_down(val, 4));
816 val = min(val, __shfl_down(val, 2));
817 val = min(val, __shfl_down(val, 1));
818 return val;
819}
820
824template< typename T >
825__global__ void reduce_kernel(T * bufred, const int n) {
826
827 T sum = 0;
828 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
829 const int str = blockDim.x * gridDim.x;
830 for (int i = idx; i<n ; i += str)
831 {
832 sum += bufred[i];
833 }
834
835 __shared__ T shared[64];
836 unsigned int lane = threadIdx.x % warpSize;
837 unsigned int wid = threadIdx.x / warpSize;
838
840 if (lane == 0)
841 shared[wid] = sum;
843
844 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
845 if (wid == 0)
847
848 if (threadIdx.x == 0)
849 bufred[blockIdx.x] = sum;
850}
851
855template< typename T >
856__global__ void reduce_max_kernel(T * bufred, const T ninf, const int n) {
857
858 T max_val = ninf;
859 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
860 const int str = blockDim.x * gridDim.x;
861 for (int i = idx; i<n ; i += str)
862 {
864 }
865
866 __shared__ T shared[64];
867 unsigned int lane = threadIdx.x % warpSize;
868 unsigned int wid = threadIdx.x / warpSize;
869
871 if (lane == 0)
872 shared[wid] = max_val;
874
876 if (wid == 0)
878
879 if (threadIdx.x == 0)
881}
882
886template< typename T >
887__global__ void reduce_min_kernel(T * bufred, const T pinf, const int n) {
888
889 T min_val = pinf;
890 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
891 const int str = blockDim.x * gridDim.x;
892 for (int i = idx; i<n ; i += str)
893 {
895 }
896
897 __shared__ T shared[64];
898 unsigned int lane = threadIdx.x % warpSize;
899 unsigned int wid = threadIdx.x / warpSize;
900
902 if (lane == 0)
903 shared[wid] = min_val;
905
907 if (wid == 0)
909
910 if (threadIdx.x == 0)
912}
913
918template< typename T >
920 const int n,
921 const int j
922 ) {
923 __shared__ T buf[1024] ;
924 const int idx = threadIdx.x;
925 const int y= blockIdx.x;
926 const int step = blockDim.x;
927
928 buf[idx]=0;
929 for (int i=idx ; i<n ; i+=step)
930 {
931 buf[idx] += bufred[i*j + y];
932 }
934
935 int i = 512;
936 while (i != 0)
937 {
938 if(threadIdx.x < i && (threadIdx.x + i) < n )
939 {
940 buf[threadIdx.x] += buf[threadIdx.x + i] ;
941 }
942 i = i>>1;
944 }
945
946 bufred[y] = buf[0];
947}
948
952template< typename T >
954 const T * b,
955 const T * c,
956 T * buf_h,
957 const int n) {
958
959 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
960 const int str = blockDim.x * gridDim.x;
961
962 const unsigned int lane = threadIdx.x % warpSize;
963 const unsigned int wid = threadIdx.x / warpSize;
964
965 __shared__ T shared[64];
966 T sum = 0.0;
967 for (int i = idx; i < n; i+= str) {
968 sum += a[i] * b[i] * c[i];
969 }
970
972 if (lane == 0)
973 shared[wid] = sum;
975
976 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
977 if (wid == 0)
979
980 if (threadIdx.x == 0)
981 buf_h[blockIdx.x] = sum;
982}
983
987template< typename T >
989 const T ** b,
990 const T * c,
991 T * buf_h,
992 const int j,
993 const int n) {
994
995 const int idx = blockIdx.x * blockDim.y + threadIdx.y;
996 const int str = blockDim.y * gridDim.x;
997 const int y = threadIdx.x;
998
999 __shared__ T buf[1024];
1000 T tmp = 0;
1001 if(y < j){
1002 for (int i = idx; i < n; i+= str) {
1003 tmp += a[i] * b[threadIdx.x][i] * c[i];
1004 }
1005 }
1006
1007 buf[threadIdx.y*blockDim.x+y] = tmp;
1008 __syncthreads();
1009
1010 int i = blockDim.y>>1;
1011 while (i != 0) {
1012 if (threadIdx.y < i) {
1013 buf[threadIdx.y*blockDim.x +y] += buf[(threadIdx.y + i)*blockDim.x+y];
1014 }
1015 __syncthreads();
1016 i = i>>1;
1017 }
1018 if (threadIdx.y == 0) {
1019 if( y < j) {
1020 buf_h[j*blockIdx.x+y] = buf[y];
1021 }
1022 }
1023}
1024
1028template< typename T >
1030 const T * b,
1031 T * buf_h,
1032 const int n) {
1033
1034 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1035 const int str = blockDim.x * gridDim.x;
1036
1037 const unsigned int lane = threadIdx.x % warpSize;
1038 const unsigned int wid = threadIdx.x / warpSize;
1039
1040 __shared__ T shared[64];
1041 T sum = 0.0;
1042 for (int i = idx; i < n; i+= str) {
1043 sum += a[i] * b[i];
1044 }
1045
1047 if (lane == 0)
1048 shared[wid] = sum;
1049 __syncthreads();
1050
1051 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1052 if (wid == 0)
1054
1055 if (threadIdx.x == 0)
1056 buf_h[blockIdx.x] = sum;
1057
1058}
1059
1063template< typename T >
1065 const T * b,
1066 T * buf_h,
1067 const int n) {
1068
1069 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1070 const int str = blockDim.x * gridDim.x;
1071
1072 const unsigned int lane = threadIdx.x % warpSize;
1073 const unsigned int wid = threadIdx.x / warpSize;
1074
1075 __shared__ T shared[64];
1076 T sum = 0.0;
1077 for (int i = idx; i < n; i+= str) {
1078 sum += pow(a[i] - b[i], 2.0);
1079 }
1080
1082 if (lane == 0)
1083 shared[wid] = sum;
1084 __syncthreads();
1085
1086 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1087 if (wid == 0)
1089
1090 if (threadIdx.x == 0)
1091 buf_h[blockIdx.x] = sum;
1092
1093}
1094
1098template< typename T >
1100 T * buf_h,
1101 const int n) {
1102
1103 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1104 const int str = blockDim.x * gridDim.x;
1105
1106 const unsigned int lane = threadIdx.x % warpSize;
1107 const unsigned int wid = threadIdx.x / warpSize;
1108
1109 __shared__ T shared[64];
1110 T sum = 0;
1111 for (int i = idx; i<n ; i += str)
1112 {
1113 sum += a[i];
1114 }
1115
1117 if (lane == 0)
1118 shared[wid] = sum;
1119 __syncthreads();
1120
1121 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1122 if (wid == 0)
1124
1125 if (threadIdx.x == 0)
1126 buf_h[blockIdx.x] = sum;
1127
1128}
1129
1133template< typename T >
1135 const T ninf,
1136 T * buf_h,
1137 const int n) {
1138
1139 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1140 const int str = blockDim.x * gridDim.x;
1141
1142 const unsigned int lane = threadIdx.x % warpSize;
1143 const unsigned int wid = threadIdx.x / warpSize;
1144
1145 __shared__ T shared[64];
1146 T max_val = ninf;
1147 for (int i = idx; i<n ; i += str)
1148 {
1149 max_val = max(max_val, a[i]);
1150 }
1151
1153 if (lane == 0)
1154 shared[wid] = max_val;
1155 __syncthreads();
1156
1158 if (wid == 0)
1160
1161 if (threadIdx.x == 0)
1162 buf_h[blockIdx.x] = max_val;
1163
1164}
1165
1169template< typename T >
1171 const T pinf,
1172 T * buf_h,
1173 const int n) {
1174
1175 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1176 const int str = blockDim.x * gridDim.x;
1177
1178 const unsigned int lane = threadIdx.x % warpSize;
1179 const unsigned int wid = threadIdx.x / warpSize;
1180
1181 __shared__ T shared[64];
1182 T min_val = pinf;
1183 for (int i = idx; i<n ; i += str)
1184 {
1185 min_val = min(min_val, a[i]);
1186 }
1187
1189 if (lane == 0)
1190 shared[wid] = min_val;
1191 __syncthreads();
1192
1194 if (wid == 0)
1196
1197 if (threadIdx.x == 0)
1198 buf_h[blockIdx.x] = min_val;
1199
1200}
1201
1205template< typename T >
1207 const int n) {
1208
1209 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1210 const int str = blockDim.x * gridDim.x;
1211
1212 for (int i = idx; i < n; i += str) {
1213 a[i] = fabs(a[i]);
1214 }
1215}
1216
1217// ========================================================================== //
1218// Kernels for the point-wise operations
1219
1224template <typename T>
1225__global__ void
1226 pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {
1227
1228 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1229 const int str = blockDim.x * gridDim.x;
1230
1231 for (int i = idx; i < n; i += str) a[i] = max(a[i], b[i]);
1232}
1233
1238template <typename T>
1240 T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
1241 const int n) {
1242
1243 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1244 const int str = blockDim.x * gridDim.x;
1245
1246 for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
1247}
1248
1253template <typename T>
1254__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {
1255
1256 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1257 const int str = blockDim.x * gridDim.x;
1258
1259 for (int i = idx; i < n; i += str) a[i] = max(a[i], c);
1260}
1261
1266template <typename T>
1268 T* __restrict__ a, const T* __restrict b, const T c, const int n) {
1269
1270 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1271 const int str = blockDim.x * gridDim.x;
1272
1273 for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
1274}
1275
1280template <typename T>
1281__global__ void
1282 pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {
1283
1284 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1285 const int str = blockDim.x * gridDim.x;
1286
1287 for (int i = idx; i < n; i += str) a[i] = min(a[i], b[i]);
1288}
1289
1294template <typename T>
1296 T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
1297 const int n) {
1298
1299 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1300 const int str = blockDim.x * gridDim.x;
1301
1302 for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
1303}
1304
1309template <typename T>
1310__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {
1311
1312 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1313 const int str = blockDim.x * gridDim.x;
1314
1315 for (int i = idx; i < n; i += str) a[i] = min(a[i], c);
1316}
1317
1322template <typename T>
1324 T* __restrict__ a, const T* __restrict b, const T c, const int n) {
1325
1326 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1327 const int str = blockDim.x * gridDim.x;
1328
1329 for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
1330}
1331
1332#endif // __MATH_MATH_KERNEL_H__
const int i
const int e
const int j
__syncthreads()
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void addcol4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cwrap_kernel(T *__restrict__ a, const T min_val, const T max_val, const int n)
__global__ void pwmin_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void reduce_kernel(T *bufred, const int n)
__global__ void cdiv2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void invcol2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add4s3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T c1, const T c2, const T c3, const int n)
__inline__ __device__ T reduce_warp(T val)
__global__ void masked_scatter_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void masked_atomic_reduction_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
__global__ void pwmax_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h, const int j, const int n)
__global__ void addcol3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T s, const int n)
__global__ void cfill_mask_kernel(T *__restrict__ a, const T c, const int size, int *__restrict__ mask, const int mask_size)
__global__ void cdiv_kernel(T *__restrict__ a, const T c, const int n)
__global__ void face_masked_gather_copy_kernel(T *__restrict__ a, const T *__restrict__ b, const int *__restrict__ mask, const int *__restrict__ facet, const int n1, const int n2, const int lx, const int ly, const int lz, const int n_mask)
__global__ void glsc3_reduce_kernel(T *bufred, const int n, const int j)
__global__ void masked_gather_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:76
__global__ void pwmax_sca2_kernel(T *__restrict__ a, const T c, const int n)
__device__ __forceinline__ void face_gather_nonlinear_index(int *index, const int idx, const int lx, const int ly, const int lz)
Definition math_kernel.h:91
__global__ void reduce_max_kernel(T *bufred, const T ninf, const int n)
__global__ void glsubnorm2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void pwmin_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T c1, const T c2, const int n)
__inline__ __device__ T reduce_max_warp(T val)
__global__ void add2s1_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__device__ __forceinline__ int face_gather_idx(const int i, const int j, const int k, const int l, const int n1, const int n2, const int nf)
__global__ void add5s4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T *__restrict__ e, const T c1, const T c2, const T c3, const T c4, const int n)
__global__ void masked_gather_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:57
__global__ void add2s2_many_kernel(T *__restrict__ x, const T **p, const T *alpha, const int p_cur, const int n)
__global__ void pwmax_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void cmult_kernel(T *__restrict__ a, const T c, const int n)
Definition math_kernel.h:41
__global__ void addcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void pwmin_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void pwmax_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void col2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void col3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void sub2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void glmin_kernel(const T *a, const T pinf, T *buf_h, const int n)
__global__ void glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void cmult2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void pwmin_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void sub3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsum_kernel(const T *a, T *buf_h, const int n)
__global__ void masked_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
__global__ void add2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void vdot3_kernel(T *__restrict__ dot, const T *__restrict__ u1, const T *__restrict__ u2, const T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const int n)
__global__ void invcol1_kernel(T *__restrict__ a, const int n)
__global__ void add3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glmax_kernel(const T *a, const T ninf, T *buf_h, const int n)
__global__ void add4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cfill_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_scatter_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void reduce_min_kernel(T *bufred, const T pinf, const int n)
__global__ void vcross_kernel(T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const T *__restrict__ w1, const T *__restrict__ w2, const T *__restrict__ w3, const int n)
__global__ void addsqr2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void cadd2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void absval_kernel(T *__restrict__ a, const int n)
__global__ void invcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void subcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void cadd_kernel(T *__restrict__ a, const T c, const int n)
__inline__ __device__ T reduce_min_warp(T val)
real * bufred
Definition math.cu:662
Object for handling masks in Neko.
Definition mask.f90:34
real * buf
Definition pipecg_aux.cu:42
#define max(a, b)
Definition tensor.cu:40