Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math_kernel.h
Go to the documentation of this file.
1#ifndef __MATH_MATH_KERNEL_H__
2#define __MATH_MATH_KERNEL_H__
3/*
4 Copyright (c) 2021-2025, The Neko Authors
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following
16 disclaimer in the documentation and/or other materials provided
17 with the distribution.
18
19 * Neither the name of the authors nor the names of its
20 contributors may be used to endorse or promote products derived
21 from this software without specific prior written permission.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35*/
36
40template< typename T >
42 const T c,
43 const int n) {
44
45 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
46 const int str = blockDim.x * gridDim.x;
47
48 for (int i = idx; i < n; i += str) {
49 a[i] = c * a[i];
50 }
51}
52
56template< typename T >
58 T * __restrict__ b,
59 int * __restrict__ mask,
60 const int n,
61 const int n_mask) {
62
63 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
64 const int str = blockDim.x * gridDim.x;
65
66 for (int i = idx; i < n_mask; i += str) {
67 a[i] = b[mask[i+1]-1];
68 }
69}
70
71
75template< typename T >
77 T * __restrict__ b,
78 int * __restrict__ mask,
79 const int n,
80 const int n_mask) {
81
82 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
83 const int str = blockDim.x * gridDim.x;
84
85 for (int i = idx; i < n_mask; i += str) {
86 a[i] = b[mask[i]];
87 }
88}
89
91void face_gather_nonlinear_index(int *index, const int idx, const int lx,
92 const int ly, const int lz) {
93 const int idx2 = idx - 1;
94 index[3] = idx2 / (lx * ly * lz);
95 index[2] = (idx2 - (lx * ly * lz) * index[3]) / (lx * ly);
96 index[1] = (idx2 - (lx * ly * lz) * index[3] - (lx * ly) * index[2]) / lx;
97 index[0] = (idx2 - (lx * ly * lz) * index[3] - (lx * ly) * index[2]) -
98 lx * index[1];
99 index[0]++;
100 index[1]++;
101 index[2]++;
102 index[3]++;
103}
104
106int face_gather_idx(const int i, const int j, const int k, const int l,
107 const int n1, const int n2, const int nf) {
108 return ((i) + (n1) * (((j) - 1) + (n2) * (((k) - 1) + (nf) * (((l) - 1))))) - 1;
109}
110
114template< typename T >
116 const T * __restrict__ b,
117 const int * __restrict__ mask,
118 const int * __restrict__ facet,
119 const int n1,
120 const int n2,
121 const int lx,
122 const int ly,
123 const int lz,
124 const int n_mask) {
125 int index[4];
126
127 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
128 const int str = blockDim.x * gridDim.x;
129
130 for (int m = idx; m < n_mask; m += str) {
131 const int f = facet[m + 1];
132 face_gather_nonlinear_index(index, mask[m + 1], lx, ly, lz);
133
134 switch (f) {
135 case 1:
136 case 2:
137 a[m] = b[face_gather_idx(index[1], index[2], f, index[3], n1, n2, 6)];
138 break;
139 case 3:
140 case 4:
141 a[m] = b[face_gather_idx(index[0], index[2], f, index[3], n1, n2, 6)];
142 break;
143 case 5:
144 case 6:
145 a[m] = b[face_gather_idx(index[0], index[1], f, index[3], n1, n2, 6)];
146 break;
147 }
148 }
149}
150
151
155template< typename T >
157 T * __restrict__ b,
158 int * __restrict__ mask,
159 const int n,
160 const int n_mask) {
161
162 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
163 const int str = blockDim.x * gridDim.x;
164
165 for (int i = idx; i < n_mask; i += str) {
166 a[mask[i+1]-1] = b[i];
167 }
168}
169
173template< typename T >
175 T * __restrict__ b,
176 int * __restrict__ mask,
177 const int n,
178 const int n_mask) {
179
180 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
181 const int str = blockDim.x * gridDim.x;
182
183 for (int i = idx; i < n_mask; i += str) {
184 a[mask[i]] = b[i];
185 }
186}
187
188
192template< typename T >
194 T * __restrict__ b,
195 int * __restrict__ mask,
196 const int n,
197 const int m) {
198
199 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
200 const int str = blockDim.x * gridDim.x;
201
202 for (int i = idx; i < m; i += str) {
203#if __CUDA_ARCH__ >= 600
204 atomicAdd( &(a[mask[i+1]-1]), b[i]);
205#endif
206 }
207}
208
212template< typename T >
214 T * __restrict__ b,
215 int * __restrict__ mask,
216 const int n,
217 const int n_mask) {
218
219 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
220 const int str = blockDim.x * gridDim.x;
221
222 for (int i = idx; i < n_mask; i += str) {
223 a[mask[i+1]-1] = b[mask[i+1]-1];
224 }
225}
226
230template< typename T >
232 T * __restrict__ b,
233 int * __restrict__ mask,
234 const int n,
235 const int n_mask) {
236
237 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
238 const int str = blockDim.x * gridDim.x;
239
240 for (int i = idx; i < n_mask; i += str) {
241 a[mask[i]] = b[mask[i]];
242 }
243}
244
248template <typename T>
250 const T c,
251 const int size,
252 int* __restrict__ mask,
253 const int mask_size) {
254
255 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
256 const int str = blockDim.x * gridDim.x;
257
258 for (int i = idx; i < mask_size; i += str) { a[mask[i]] = c; }
259}
260
264template< typename T >
266 T * __restrict__ b,
267 const T c,
268 const int n) {
269
270 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
271 const int str = blockDim.x * gridDim.x;
272
273 for (int i = idx; i < n; i += str) {
274 a[i] = c * b[i];
275 }
276}
277
281template< typename T >
283 const T c,
284 const int n) {
285
286 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
287 const int str = blockDim.x * gridDim.x;
288
289 for (int i = idx; i < n; i += str) {
290 a[i] = c / a[i];
291 }
292}
293
297template< typename T >
299 T * __restrict__ b,
300 const T c,
301 const int n) {
302
303 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
304 const int str = blockDim.x * gridDim.x;
305
306 for (int i = idx; i < n; i += str) {
307 a[i] = c / b[i];
308 }
309}
310
314template< typename T >
316 const T c,
317 const int n) {
318
319 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
320 const int str = blockDim.x * gridDim.x;
321
322 for (int i = idx; i < n; i += str) {
323 a[i] = a[i] + c;
324 }
325}
326
330template< typename T >
332 T * __restrict__ b,
333 const T c,
334 const int n) {
335
336 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
337 const int str = blockDim.x * gridDim.x;
338
339 for (int i = idx; i < n; i += str) {
340 a[i] = b[i] + c;
341 }
342}
343
347template< typename T >
349 const T min_val,
350 const T max_val,
351 const int n) {
352
353 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
354 const int str = blockDim.x * gridDim.x;
355 const T l = max_val - min_val;
356
357 for (int i = idx; i < n; i += str) {
358 a[i] = min_val + fmod(fmod(a[i] - min_val, l) + l, l);
359 }
360}
361
365template< typename T >
367 const T c,
368 const int n) {
369
370 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
371 const int str = blockDim.x * gridDim.x;
372
373 for (int i = idx; i < n; i += str) {
374 a[i] = c;
375 }
376}
377
381template< typename T >
383 const T * __restrict__ b,
384 const int n) {
385
386 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
387 const int str = blockDim.x * gridDim.x;
388
389 for (int i = idx; i < n; i += str) {
390 a[i] = a[i] + b[i];
391 }
392}
393
397template< typename T >
399 const T * __restrict__ b,
400 const T * __restrict__ c,
401 const int n) {
402
403 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
404 const int str = blockDim.x * gridDim.x;
405
406 for (int i = idx; i < n; i += str) {
407 a[i] = b[i] + c[i];
408 }
409}
410
414template< typename T >
416 const T * __restrict__ b,
417 const T * __restrict__ c,
418 const T * __restrict__ d,
419 const int n) {
420
421 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
422 const int str = blockDim.x * gridDim.x;
423
424 for (int i = idx; i < n; i += str) {
425 a[i] = b[i] + c[i] + d[i];
426 }
427}
428
432template< typename T >
434 const T * __restrict__ b,
435 const T c1,
436 const int n) {
437
438 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
439 const int str = blockDim.x * gridDim.x;
440
441 for (int i = idx; i < n; i += str) {
442 a[i] = c1 * a[i] + b[i];
443 }
444}
445
449template< typename T >
451 const T ** p,
452 const T * alpha,
453 const int p_cur,
454 const int n) {
455
456 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
457 const int str = blockDim.x * gridDim.x;
458
459
460 for (int i = idx; i < n; i+= str) {
461 T tmp = 0.0;
462 for (int j = 0; j < p_cur; j ++) {
463 tmp += p[j][i]*alpha[j];
464 }
465 x[i] += tmp;
466 }
467}
468
472template< typename T >
474 const T * __restrict__ b,
475 const T c1,
476 const int n) {
477
478 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
479 const int str = blockDim.x * gridDim.x;
480
481 for (int i = idx; i < n; i += str) {
482 a[i] = a[i] + c1 * b[i];
483 }
484}
485
489template< typename T >
491 const T * __restrict__ b,
492 const T c1,
493 const int n) {
494
495 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
496 const int str = blockDim.x * gridDim.x;
497
498 for (int i = idx; i < n; i += str) {
499 a[i] = a[i] + c1 * (b[i] * b[i]);
500 }
501}
502
506template< typename T >
508 const T * __restrict__ b,
509 const T * __restrict__ c,
510 const T c1,
511 const T c2,
512 const int n) {
513
514 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
515 const int str = blockDim.x * gridDim.x;
516
517 for (int i = idx; i < n; i += str) {
518 a[i] = c1 * b[i] + c2 * c[i];
519 }
520}
521
525template< typename T >
527 const T * __restrict__ b,
528 const T * __restrict__ c,
529 const T * __restrict__ d,
530 const T c1,
531 const T c2,
532 const T c3,
533 const int n) {
534
535 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
536 const int str = blockDim.x * gridDim.x;
537
538 for (int i = idx; i < n; i += str) {
539 a[i] = c1 * b[i] + c2 * c[i] + c3 * d[i];
540 }
541}
542
546template< typename T >
548 const T * __restrict__ b,
549 const T * __restrict__ c,
550 const T * __restrict__ d,
551 const T * __restrict__ e,
552 const T c1,
553 const T c2,
554 const T c3,
555 const T c4,
556 const int n) {
557
558 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
559 const int str = blockDim.x * gridDim.x;
560
561 for (int i = idx; i < n; i += str) {
562 a[i] = a[i] + c1 * b[i] + c2 * c[i] + c3 * d[i] + c4 * e[i];
563 }
564}
565
569template< typename T >
571 const int n) {
572
573 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
574 const int str = blockDim.x * gridDim.x;
575 const T one = 1.0;
576
577 for (int i = idx; i < n; i += str) {
578 a[i] = one / a[i];
579 }
580}
581
585template< typename T >
587 const T * __restrict__ b,
588 const int n) {
589
590 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
591 const int str = blockDim.x * gridDim.x;
592
593 for (int i = idx; i < n; i += str) {
594 a[i] = a[i] / b[i];
595 }
596}
597
601template< typename T >
603 const T * __restrict__ b,
604 const T * __restrict__ c,
605 const int n) {
606
607 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
608 const int str = blockDim.x * gridDim.x;
609
610 for (int i = idx; i < n; i += str) {
611 a[i] = b[i] / c[i];
612 }
613}
614
618template< typename T >
620 const T * __restrict__ b,
621 const int n) {
622
623 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
624 const int str = blockDim.x * gridDim.x;
625
626 for (int i = idx; i < n; i += str) {
627 a[i] = a[i] * b[i];
628 }
629}
630
634template< typename T >
636 const T * __restrict__ b,
637 const T * __restrict__ c,
638 const int n) {
639
640 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
641 const int str = blockDim.x * gridDim.x;
642
643 for (int i = idx; i < n; i += str) {
644 a[i] = b[i] * c[i];
645 }
646}
647
651template< typename T >
653 const T * __restrict__ b,
654 const T * __restrict__ c,
655 const int n) {
656
657 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
658 const int str = blockDim.x * gridDim.x;
659
660 for (int i = idx; i < n; i += str) {
661 a[i] = a[i] - b[i] * c[i];
662 }
663}
664
668template< typename T >
670 const T * __restrict__ b,
671 const int n) {
672
673 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
674 const int str = blockDim.x * gridDim.x;
675
676 for (int i = idx; i < n; i += str) {
677 a[i] = a[i] - b[i];
678 }
679}
680
684template< typename T >
686 const T * __restrict__ b,
687 const T * __restrict__ c,
688 const int n) {
689
690 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
691 const int str = blockDim.x * gridDim.x;
692
693 for (int i = idx; i < n; i += str) {
694 a[i] = b[i] - c[i];
695 }
696}
697
701template< typename T >
703 const T * __restrict__ b,
704 const T * __restrict__ c,
705 const int n) {
706
707 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
708 const int str = blockDim.x * gridDim.x;
709
710 for (int i = idx; i < n; i += str) {
711 a[i] = a[i] + b[i] * c[i];
712 }
713
714}
715
719template< typename T >
721 const T * __restrict__ b,
722 const T * __restrict__ c,
723 const T * __restrict__ d,
724 const int n) {
725
726 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
727 const int str = blockDim.x * gridDim.x;
728
729 for (int i = idx; i < n; i += str) {
730 a[i] = a[i] + b[i] * c[i] * d[i];
731 }
732
733}
734
738template< typename T >
740 const T * __restrict__ b,
741 const T * __restrict__ c,
742 const T s,
743 const int n) {
744
745 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
746 const int str = blockDim.x * gridDim.x;
747
748 for (int i = idx; i < n; i += str) {
749 a[i] = a[i] + s * b[i] * c[i];
750 }
751
752}
753
757template< typename T >
759 const T * __restrict__ u1,
760 const T * __restrict__ u2,
761 const T * __restrict__ u3,
762 const T * __restrict__ v1,
763 const T * __restrict__ v2,
764 const T * __restrict__ v3,
765 const int n) {
766
767 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
768 const int str = blockDim.x * gridDim.x;
769
770 for (int i = idx; i < n; i += str) {
771 dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
772 }
773
774}
775
779template< typename T >
781 T * __restrict__ u2,
782 T * __restrict__ u3,
783 const T * __restrict__ v1,
784 const T * __restrict__ v2,
785 const T * __restrict__ v3,
786 const T * __restrict__ w1,
787 const T * __restrict__ w2,
788 const T * __restrict__ w3,
789 const int n) {
790
791 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
792 const int str = blockDim.x * gridDim.x;
793
794 for (int i = idx; i < n; i += str) {
795 u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
796 u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
797 u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
798 }
799
800}
801
802
806template< typename T>
808 val += __shfl_down_sync(0xffffffff, val, 16);
809 val += __shfl_down_sync(0xffffffff, val, 8);
810 val += __shfl_down_sync(0xffffffff, val, 4);
811 val += __shfl_down_sync(0xffffffff, val, 2);
812 val += __shfl_down_sync(0xffffffff, val, 1);
813 return val;
814}
815
819template< typename T>
821 val = max(val, __shfl_down_sync(0xffffffff, val, 16));
822 val = max(val, __shfl_down_sync(0xffffffff, val, 8));
823 val = max(val, __shfl_down_sync(0xffffffff, val, 4));
824 val = max(val, __shfl_down_sync(0xffffffff, val, 2));
825 val = max(val, __shfl_down_sync(0xffffffff, val, 1));
826 return val;
827}
828
832template< typename T>
834 val = min(val, __shfl_down_sync(0xffffffff, val, 16));
835 val = min(val, __shfl_down_sync(0xffffffff, val, 8));
836 val = min(val, __shfl_down_sync(0xffffffff, val, 4));
837 val = min(val, __shfl_down_sync(0xffffffff, val, 2));
838 val = min(val, __shfl_down_sync(0xffffffff, val, 1));
839 return val;
840}
841
845template< typename T >
846__global__ void reduce_kernel(T * bufred, const int n) {
847
848 T sum = 0;
849 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
850 const int str = blockDim.x * gridDim.x;
851 for (int i = idx; i<n ; i += str)
852 {
853 sum += bufred[i];
854 }
855
856 __shared__ T shared[32];
857 unsigned int lane = threadIdx.x % warpSize;
858 unsigned int wid = threadIdx.x / warpSize;
859
861 if (lane == 0)
862 shared[wid] = sum;
864
865 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
866 if (wid == 0)
868
869 if (threadIdx.x == 0)
870 bufred[blockIdx.x] = sum;
871}
872
876template< typename T >
877__global__ void reduce_max_kernel(T * bufred, const T ninf, const int n) {
878
879 T max_val = ninf;
880 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
881 const int str = blockDim.x * gridDim.x;
882 for (int i = idx; i<n ; i += str)
883 {
885 }
886
887 __shared__ T shared[32];
888 unsigned int lane = threadIdx.x % warpSize;
889 unsigned int wid = threadIdx.x / warpSize;
890
892 if (lane == 0)
893 shared[wid] = max_val;
895
897 if (wid == 0)
899
900 if (threadIdx.x == 0)
902}
903
907template< typename T >
908__global__ void reduce_min_kernel(T * bufred, const T pinf, const int n) {
909
910 T min_val = pinf;
911 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
912 const int str = blockDim.x * gridDim.x;
913 for (int i = idx; i<n ; i += str)
914 {
916 }
917
918 __shared__ T shared[32];
919 unsigned int lane = threadIdx.x % warpSize;
920 unsigned int wid = threadIdx.x / warpSize;
921
923 if (lane == 0)
924 shared[wid] = min_val;
926
928 if (wid == 0)
930
931 if (threadIdx.x == 0)
933}
934
939template< typename T >
941 const int n,
942 const int j
943 ) {
944 __shared__ T buf[1024] ;
945 const int idx = threadIdx.x;
946 const int y= blockIdx.x;
947 const int step = blockDim.x;
948
949 buf[idx]=0;
950 for (int i=idx ; i<n ; i+=step)
951 {
952 buf[idx] += bufred[i*j + y];
953 }
955
956 int i = 512;
957 while (i != 0)
958 {
959 if(threadIdx.x < i && (threadIdx.x + i) < n )
960 {
961 buf[threadIdx.x] += buf[threadIdx.x + i] ;
962 }
963 i = i>>1;
965 }
966
967 bufred[y] = buf[0];
968}
969
970
974template< typename T >
976 const T * b,
977 const T * c,
978 T * buf_h,
979 const int n) {
980
981 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
982 const int str = blockDim.x * gridDim.x;
983
984 const unsigned int lane = threadIdx.x % warpSize;
985 const unsigned int wid = threadIdx.x / warpSize;
986
987 __shared__ T shared[32];
988 T sum = 0.0;
989 for (int i = idx; i < n; i+= str) {
990 sum += a[i] * b[i] * c[i];
991 }
992
994 if (lane == 0)
995 shared[wid] = sum;
997
998 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
999 if (wid == 0)
1001
1002 if (threadIdx.x == 0)
1003 buf_h[blockIdx.x] = sum;
1004}
1005
1009template< typename T >
1011 const T ** b,
1012 const T * c,
1013 T * buf_h,
1014 const int j,
1015 const int n) {
1016
1017 const int idx = blockIdx.x * blockDim.y + threadIdx.y;
1018 const int str = blockDim.y * gridDim.x;
1019 const int y = threadIdx.x;
1020
1021 __shared__ T buf[1024];
1022 T tmp = 0;
1023 if(y < j){
1024 for (int i = idx; i < n; i+= str) {
1025 tmp += a[i] * b[threadIdx.x][i] * c[i];
1026 }
1027 }
1028
1029 buf[threadIdx.y*blockDim.x+y] = tmp;
1030 __syncthreads();
1031
1032 int i = blockDim.y>>1;
1033 while (i != 0) {
1034 if (threadIdx.y < i) {
1035 buf[threadIdx.y*blockDim.x +y] += buf[(threadIdx.y + i)*blockDim.x+y];
1036 }
1037 __syncthreads();
1038 i = i>>1;
1039 }
1040 if (threadIdx.y == 0) {
1041 if( y < j) {
1042 buf_h[j*blockIdx.x+y] = buf[y];
1043 }
1044 }
1045}
1046
1050template< typename T >
1052 const T * b,
1053 T * buf_h,
1054 const int n) {
1055
1056 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1057 const int str = blockDim.x * gridDim.x;
1058
1059 const unsigned int lane = threadIdx.x % warpSize;
1060 const unsigned int wid = threadIdx.x / warpSize;
1061
1062 __shared__ T shared[32];
1063 T sum = 0.0;
1064 for (int i = idx; i < n; i+= str) {
1065 sum += a[i] * b[i];
1066 }
1067
1069 if (lane == 0)
1070 shared[wid] = sum;
1071 __syncthreads();
1072
1073 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1074 if (wid == 0)
1076
1077 if (threadIdx.x == 0)
1078 buf_h[blockIdx.x] = sum;
1079
1080}
1081
1085template< typename T >
1087 const T * b,
1088 T * buf_h,
1089 const int n) {
1090
1091 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1092 const int str = blockDim.x * gridDim.x;
1093
1094 const unsigned int lane = threadIdx.x % warpSize;
1095 const unsigned int wid = threadIdx.x / warpSize;
1096
1097 __shared__ T shared[32];
1098 T sum = 0.0;
1099 for (int i = idx; i < n; i+= str) {
1100 sum += pow(a[i] - b[i], 2.0);
1101 }
1102
1104 if (lane == 0)
1105 shared[wid] = sum;
1106 __syncthreads();
1107
1108 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1109 if (wid == 0)
1111
1112 if (threadIdx.x == 0)
1113 buf_h[blockIdx.x] = sum;
1114
1115}
1116
1120template< typename T >
1122 T * buf_h,
1123 const int n) {
1124
1125 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1126 const int str = blockDim.x * gridDim.x;
1127
1128 const unsigned int lane = threadIdx.x % warpSize;
1129 const unsigned int wid = threadIdx.x / warpSize;
1130
1131 __shared__ T shared[32];
1132 T sum = 0;
1133 for (int i = idx; i<n ; i += str)
1134 {
1135 sum += a[i];
1136 }
1137
1139 if (lane == 0)
1140 shared[wid] = sum;
1141 __syncthreads();
1142
1143 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1144 if (wid == 0)
1146
1147 if (threadIdx.x == 0)
1148 buf_h[blockIdx.x] = sum;
1149
1150}
1151
1155template< typename T >
1157 const T ninf,
1158 T * buf_h,
1159 const int n) {
1160
1161 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1162 const int str = blockDim.x * gridDim.x;
1163
1164 const unsigned int lane = threadIdx.x % warpSize;
1165 const unsigned int wid = threadIdx.x / warpSize;
1166
1167 __shared__ T shared[32];
1168 T max_val = ninf;
1169 for (int i = idx; i<n ; i += str)
1170 {
1171 max_val = max(max_val, a[i]);
1172 }
1173
1175 if (lane == 0)
1176 shared[wid] = max_val;
1177 __syncthreads();
1178
1180 if (wid == 0)
1182
1183 if (threadIdx.x == 0)
1184 buf_h[blockIdx.x] = max_val;
1185
1186}
1187
1191template< typename T >
1193 const T pinf,
1194 T * buf_h,
1195 const int n) {
1196
1197 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1198 const int str = blockDim.x * gridDim.x;
1199
1200 const unsigned int lane = threadIdx.x % warpSize;
1201 const unsigned int wid = threadIdx.x / warpSize;
1202
1203 __shared__ T shared[32];
1204 T min_val = pinf;
1205 for (int i = idx; i<n ; i += str)
1206 {
1207 min_val = min(min_val, a[i]);
1208 }
1209
1211 if (lane == 0)
1212 shared[wid] = min_val;
1213 __syncthreads();
1214
1216 if (wid == 0)
1218
1219 if (threadIdx.x == 0)
1220 buf_h[blockIdx.x] = min_val;
1221
1222}
1223
1227template< typename T >
1229 const int n) {
1230
1231 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1232 const int str = blockDim.x * gridDim.x;
1233
1234 for (int i = idx; i < n; i += str) {
1235 a[i] = fabs(a[i]);
1236 }
1237}
1238
1239// ========================================================================== //
1240// Kernels for the point-wise operations
1241
1246template <typename T>
1247__global__ void
1248 pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {
1249
1250 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1251 const int str = blockDim.x * gridDim.x;
1252
1253 for (int i = idx; i < n; i += str) a[i] = max(a[i], b[i]);
1254}
1255
1260template <typename T>
1262 T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
1263 const int n) {
1264
1265 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1266 const int str = blockDim.x * gridDim.x;
1267
1268 for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
1269}
1270
1275template <typename T>
1276__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {
1277
1278 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1279 const int str = blockDim.x * gridDim.x;
1280
1281 for (int i = idx; i < n; i += str) a[i] = max(a[i], c);
1282}
1283
1288template <typename T>
1290 T* __restrict__ a, const T* __restrict b, const T c, const int n) {
1291
1292 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1293 const int str = blockDim.x * gridDim.x;
1294
1295 for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
1296}
1297
1302template <typename T>
1303__global__ void
1304 pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {
1305
1306 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1307 const int str = blockDim.x * gridDim.x;
1308
1309 for (int i = idx; i < n; i += str) a[i] = min(a[i], b[i]);
1310}
1311
1316template <typename T>
1318 T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
1319 const int n) {
1320
1321 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1322 const int str = blockDim.x * gridDim.x;
1323
1324 for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
1325}
1326
1331template <typename T>
1332__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {
1333
1334 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1335 const int str = blockDim.x * gridDim.x;
1336
1337 for (int i = idx; i < n; i += str) a[i] = min(a[i], c);
1338}
1339
1344template <typename T>
1346 T* __restrict__ a, const T* __restrict b, const T c, const int n) {
1347
1348 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1349 const int str = blockDim.x * gridDim.x;
1350
1351 for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
1352}
1353
1354#endif // __MATH_MATH_KERNEL_H__
const int i
const int e
const int j
__syncthreads()
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void addcol4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cwrap_kernel(T *__restrict__ a, const T min_val, const T max_val, const int n)
__global__ void pwmin_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void reduce_kernel(T *bufred, const int n)
__global__ void cdiv2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void invcol2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add4s3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T c1, const T c2, const T c3, const int n)
__inline__ __device__ T reduce_warp(T val)
__global__ void masked_scatter_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void masked_atomic_reduction_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
__global__ void pwmax_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h, const int j, const int n)
__global__ void addcol3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T s, const int n)
__global__ void cfill_mask_kernel(T *__restrict__ a, const T c, const int size, int *__restrict__ mask, const int mask_size)
__global__ void cdiv_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_copy_kernel_aligned(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void face_masked_gather_copy_kernel(T *__restrict__ a, const T *__restrict__ b, const int *__restrict__ mask, const int *__restrict__ facet, const int n1, const int n2, const int lx, const int ly, const int lz, const int n_mask)
__global__ void glsc3_reduce_kernel(T *bufred, const int n, const int j)
__global__ void masked_gather_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:76
__global__ void pwmax_sca2_kernel(T *__restrict__ a, const T c, const int n)
__device__ __forceinline__ void face_gather_nonlinear_index(int *index, const int idx, const int lx, const int ly, const int lz)
Definition math_kernel.h:91
__global__ void reduce_max_kernel(T *bufred, const T ninf, const int n)
__global__ void glsubnorm2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void pwmin_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T c1, const T c2, const int n)
__inline__ __device__ T reduce_max_warp(T val)
__global__ void add2s1_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__device__ __forceinline__ int face_gather_idx(const int i, const int j, const int k, const int l, const int n1, const int n2, const int nf)
__global__ void add5s4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T *__restrict__ e, const T c1, const T c2, const T c3, const T c4, const int n)
__global__ void masked_gather_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:57
__global__ void add2s2_many_kernel(T *__restrict__ x, const T **p, const T *alpha, const int p_cur, const int n)
__global__ void pwmax_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void cmult_kernel(T *__restrict__ a, const T c, const int n)
Definition math_kernel.h:41
__global__ void addcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void pwmin_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void pwmax_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void col2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void masked_copy_kernel_0(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void col3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void sub2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void glmin_kernel(const T *a, const T pinf, T *buf_h, const int n)
__global__ void glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void cmult2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void pwmin_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void sub3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsum_kernel(const T *a, T *buf_h, const int n)
__global__ void glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
__global__ void add2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void vdot3_kernel(T *__restrict__ dot, const T *__restrict__ u1, const T *__restrict__ u2, const T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const int n)
__global__ void invcol1_kernel(T *__restrict__ a, const int n)
__global__ void add3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glmax_kernel(const T *a, const T ninf, T *buf_h, const int n)
__global__ void add4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cfill_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_scatter_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void reduce_min_kernel(T *bufred, const T pinf, const int n)
__global__ void vcross_kernel(T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const T *__restrict__ w1, const T *__restrict__ w2, const T *__restrict__ w3, const int n)
__global__ void addsqr2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void cadd2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void absval_kernel(T *__restrict__ a, const int n)
__global__ void invcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void subcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void cadd_kernel(T *__restrict__ a, const T c, const int n)
__inline__ __device__ T reduce_min_warp(T val)
real * bufred
Definition math.cu:679
Object for handling masks in Neko.
Definition mask.f90:34
real * buf
Definition pipecg_aux.cu:42
#define max(a, b)
Definition tensor.cu:40