Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math_kernel.h
Go to the documentation of this file.
1#ifndef __MATH_MATH_KERNEL_H__
2#define __MATH_MATH_KERNEL_H__
3/*
4 Copyright (c) 2021-2025, The Neko Authors
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following
16 disclaimer in the documentation and/or other materials provided
17 with the distribution.
18
19 * Neither the name of the authors nor the names of its
20 contributors may be used to endorse or promote products derived
21 from this software without specific prior written permission.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35*/
36
/** Multiplication by constant: \f$ a_i = c \cdot a_i \f$ (grid-stride over n). */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
52
/**
 * Gather-copy into a packed array: a[i] = b[mask[i+1]-1].
 * The mask uses Fortran-style 1-based indices starting at mask[1]
 * (mask[0] presumably holds the mask length — TODO confirm with callers).
 * n is the length of b (unused here); n_mask is the number of masked entries.
 */
template< typename T >
__global__ void masked_gather_copy_kernel(T * __restrict__ a,
                                          T * __restrict__ b,
                                          int * __restrict__ mask,
                                          const int n,
                                          const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}
70
71
/**
 * Gather-copy into a packed array with a 0-based, offset-free mask:
 * a[i] = b[mask[i]].  Unlike masked_gather_copy_kernel, the mask here is
 * already 0-based and starts at mask[0].
 */
template< typename T >
__global__ void masked_gather_copy_aligned_kernel(T * __restrict__ a,
                                                  T * __restrict__ b,
                                                  int * __restrict__ mask,
                                                  const int n,
                                                  const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[i] = b[mask[i]];
  }
}
89
/**
 * Decompose a 1-based linear index over an (lx, ly, lz, nelem) array into
 * 1-based coordinates (index[0..3]) = (i, j, k, element).
 */
__device__ __forceinline__
void face_gather_nonlinear_index(int *index, const int idx, const int lx,
                                 const int ly, const int lz) {
  const int idx2 = idx - 1; /* work with a 0-based index internally */
  index[3] = idx2 / (lx * ly * lz);
  index[2] = (idx2 - (lx * ly * lz) * index[3]) / (lx * ly);
  index[1] = (idx2 - (lx * ly * lz) * index[3] - (lx * ly) * index[2]) / lx;
  index[0] = (idx2 - (lx * ly * lz) * index[3] - (lx * ly) * index[2]) -
      lx * index[1];
  /* convert back to 1-based coordinates */
  index[0]++;
  index[1]++;
  index[2]++;
  index[3]++;
}
104
/**
 * 0-based linear offset of the 1-based coordinates (i, j, k, l) in an
 * array with extents (n1, n2, nf, ...).
 */
__device__ __forceinline__
int face_gather_idx(const int i, const int j, const int k, const int l,
                    const int n1, const int n2, const int nf) {
  return ((i) + (n1) * (((j) - 1) + (n2) * (((k) - 1) + (nf) * (((l) - 1))))) - 1;
}
110
/**
 * Gather facet values into a packed array. For each masked point, the mask
 * entry (1-based, starting at mask[1]/facet[1]) is decomposed into element
 * coordinates, and the value is read from the face array b using the two
 * in-face coordinates selected by the facet number f (1..6).
 */
template< typename T >
__global__ void face_masked_gather_copy_kernel(T * __restrict__ a,
                                               const T * __restrict__ b,
                                               const int * __restrict__ mask,
                                               const int * __restrict__ facet,
                                               const int n1,
                                               const int n2,
                                               const int lx,
                                               const int ly,
                                               const int lz,
                                               const int n_mask) {
  int index[4];

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int m = idx; m < n_mask; m += str) {
    const int f = facet[m + 1];
    face_gather_nonlinear_index(index, mask[m + 1], lx, ly, lz);

    switch (f) {
    case 1:
    case 2:
      /* faces normal to the first direction: use (j, k) */
      a[m] = b[face_gather_idx(index[1], index[2], f, index[3], n1, n2, 6)];
      break;
    case 3:
    case 4:
      /* faces normal to the second direction: use (i, k) */
      a[m] = b[face_gather_idx(index[0], index[2], f, index[3], n1, n2, 6)];
      break;
    case 5:
    case 6:
      /* faces normal to the third direction: use (i, j) */
      a[m] = b[face_gather_idx(index[0], index[1], f, index[3], n1, n2, 6)];
      break;
    }
  }
}
150
151
/**
 * Scatter-copy from a packed array: a[mask[i+1]-1] = b[i].
 * Inverse of masked_gather_copy_kernel; mask uses 1-based indices
 * starting at mask[1].
 */
template< typename T >
__global__ void masked_scatter_copy_kernel(T * __restrict__ a,
                                           T * __restrict__ b,
                                           int * __restrict__ mask,
                                           const int n,
                                           const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[i];
  }
}
169
/**
 * Scatter-copy from a packed array with a 0-based, offset-free mask:
 * a[mask[i]] = b[i].  Inverse of masked_gather_copy_aligned_kernel.
 */
template< typename T >
__global__ void masked_scatter_copy_aligned_kernel(T * __restrict__ a,
                                                   T * __restrict__ b,
                                                   int * __restrict__ mask,
                                                   const int n,
                                                   const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i]] = b[i];
  }
}
187
188
/**
 * Atomically accumulate packed values into masked locations:
 * a[mask[i+1]-1] += b[i].  Atomics are required because several mask
 * entries may target the same location.
 *
 * NOTE: atomicAdd on double requires SM60+; on older architectures this
 * kernel silently does nothing (the loop body is compiled out).
 */
template< typename T >
__global__ void masked_atomic_reduction_kernel(T * __restrict__ a,
                                               T * __restrict__ b,
                                               int * __restrict__ mask,
                                               const int n,
                                               const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
#if __CUDA_ARCH__ >= 600
    atomicAdd( &(a[mask[i+1]-1]), b[i]);
#endif
  }
}
208
/**
 * In-place copy restricted to masked locations:
 * a[mask[i+1]-1] = b[mask[i+1]-1].  mask uses 1-based indices starting
 * at mask[1]; both arrays keep their original (unpacked) layout.
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}
226
/**
 * Fill the masked (0-based) locations of a with the constant c:
 * a[mask[i]] = c for i < mask_size.  size (length of a) is unused here.
 */
template <typename T>
__global__ void cfill_mask_kernel(T* __restrict__ a,
                                  const T c,
                                  const int size,
                                  int* __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < mask_size; i += str) { a[mask[i]] = c; }
}
242
/** Multiplication by constant into a second vector: \f$ a_i = c \cdot b_i \f$. */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}
259
/** Division of a constant by a vector, in place: \f$ a_i = c / a_i \f$. */
template< typename T >
__global__ void cdiv_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / a[i];
  }
}
275
/** Division of a constant by a vector: \f$ a_i = c / b_i \f$. */
template< typename T >
__global__ void cdiv2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / b[i];
  }
}
292
/** Addition of a scalar, in place: \f$ a_i = a_i + c \f$. */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}
308
/** Addition of a scalar to a second vector: \f$ a_i = b_i + c \f$. */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}
325
/**
 * Periodically wrap each entry into the interval [min_val, max_val).
 * The double fmod maps negative remainders back into [0, l), so values
 * below min_val wrap correctly.  Assumes max_val > min_val (l != 0).
 */
template< typename T >
__global__ void cwrap_kernel(T * __restrict__ a,
                             const T min_val,
                             const T max_val,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T l = max_val - min_val;

  for (int i = idx; i < n; i += str) {
    a[i] = min_val + fmod(fmod(a[i] - min_val, l) + l, l);
  }
}
343
/** Fill a vector with the constant c: \f$ a_i = c \f$. */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}
359
/** Vector addition, in place: \f$ a_i = a_i + b_i \f$. */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}
375
/** Vector addition: \f$ a_i = b_i + c_i \f$. */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}
392
/** Three-vector addition: \f$ a_i = b_i + c_i + d_i \f$. */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}
410
/** Scaled addition (scale on a): \f$ a_i = c_1 a_i + b_i \f$. */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}
427
/**
 * Accumulate several scaled vectors at once:
 * \f$ x_i = x_i + \sum_{j<p_{cur}} \alpha_j \, p_{j,i} \f$.
 * The per-thread partial sum is accumulated in a register before the
 * single read-modify-write of x[i].
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;


  for (int i = idx; i < n; i+= str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i]*alpha[j];
    }
    x[i] += tmp;
  }
}
450
/** Scaled addition (scale on b): \f$ a_i = a_i + c_1 b_i \f$ (axpy). */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}
467
/** Scaled addition of a squared vector: \f$ a_i = a_i + c_1 b_i^2 \f$. */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}
484
/** Linear combination of two vectors: \f$ a_i = c_1 b_i + c_2 c_i \f$. */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}
503
/** Linear combination of three vectors: \f$ a_i = c_1 b_i + c_2 c_i + c_3 d_i \f$. */
template< typename T >
__global__ void add4s3_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T c1,
                              const T c2,
                              const T c3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i] + c3 * d[i];
  }
}
524
/**
 * Accumulate a linear combination of four vectors:
 * \f$ a_i = a_i + c_1 b_i + c_2 c_i + c_3 d_i + c_4 e_i \f$.
 */
template< typename T >
__global__ void add5s4_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T * __restrict__ e,
                              const T c1,
                              const T c2,
                              const T c3,
                              const T c4,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i] + c2 * c[i] + c3 * d[i] + c4 * e[i];
  }
}
547
/** Pointwise inversion, in place: \f$ a_i = 1 / a_i \f$. */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}
563
/** Pointwise division, in place: \f$ a_i = a_i / b_i \f$. */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}
579
/** Pointwise division: \f$ a_i = b_i / c_i \f$. */
template< typename T >
__global__ void invcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] / c[i];
  }
}
596
/** Pointwise multiplication, in place: \f$ a_i = a_i \cdot b_i \f$. */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}
612
/** Pointwise multiplication: \f$ a_i = b_i \cdot c_i \f$. */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}
629
/** Subtract a pointwise product: \f$ a_i = a_i - b_i \cdot c_i \f$. */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}
646
/** Vector subtraction, in place: \f$ a_i = a_i - b_i \f$. */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}
662
/** Vector subtraction: \f$ a_i = b_i - c_i \f$. */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}
679
/** Accumulate a pointwise product: \f$ a_i = a_i + b_i \cdot c_i \f$. */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }

}
697
/** Accumulate a triple pointwise product: \f$ a_i = a_i + b_i c_i d_i \f$. */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }

}
716
/** Accumulate a scaled pointwise product: \f$ a_i = a_i + s \, b_i c_i \f$. */
template< typename T >
__global__ void addcol3s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T * __restrict__ c,
                                 const T s,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + s * b[i] * c[i];
  }

}
735
/**
 * Pointwise dot product of two 3-component vector fields:
 * \f$ dot_i = u_i \cdot v_i \f$ with components (u1,u2,u3), (v1,v2,v3).
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }

}
757
/**
 * Pointwise cross product of two 3-component vector fields:
 * \f$ u_i = v_i \times w_i \f$ with components written to (u1,u2,u3).
 */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }

}
783
784
/**
 * Warp-level sum reduction via shuffle-down; the result is valid in lane 0.
 * Uses the full-warp mask, so all 32 lanes must participate.
 */
template< typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}
797
/**
 * Warp-level max reduction via shuffle-down; the result is valid in lane 0.
 * Uses the full-warp mask, so all 32 lanes must participate.
 */
template< typename T>
__inline__ __device__ T reduce_max_warp(T val) {
  val = max(val, __shfl_down_sync(0xffffffff, val, 16));
  val = max(val, __shfl_down_sync(0xffffffff, val, 8));
  val = max(val, __shfl_down_sync(0xffffffff, val, 4));
  val = max(val, __shfl_down_sync(0xffffffff, val, 2));
  val = max(val, __shfl_down_sync(0xffffffff, val, 1));
  return val;
}
810
/**
 * Warp-level min reduction via shuffle-down; the result is valid in lane 0.
 * Uses the full-warp mask, so all 32 lanes must participate.
 */
template< typename T>
__inline__ __device__ T reduce_min_warp(T val) {
  val = min(val, __shfl_down_sync(0xffffffff, val, 16));
  val = min(val, __shfl_down_sync(0xffffffff, val, 8));
  val = min(val, __shfl_down_sync(0xffffffff, val, 4));
  val = min(val, __shfl_down_sync(0xffffffff, val, 2));
  val = min(val, __shfl_down_sync(0xffffffff, val, 1));
  return val;
}
823
/**
 * Second-stage sum reduction: sums bufred[0..n) and writes one partial
 * result per block back into bufred[blockIdx.x].  Two-level scheme:
 * warp shuffle reduction, then a shared-memory reduction across warps.
 * Assumes blockDim.x <= 1024 (at most 32 warps).
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i<n ; i += str)
  {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  /* first warp reduces the per-warp partial sums */
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
854
/**
 * Second-stage max reduction: takes the max over bufred[0..n) and writes
 * one partial result per block into bufred[blockIdx.x].  ninf is the
 * identity element (negative infinity) used to pad inactive lanes.
 */
template< typename T >
__global__ void reduce_max_kernel(T * bufred, const T ninf, const int n) {

  T max_val = ninf;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i<n ; i += str)
  {
    max_val = max(max_val, bufred[i]);
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  max_val = reduce_max_warp<T>(max_val);
  if (lane == 0)
    shared[wid] = max_val;
  __syncthreads();

  max_val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : ninf;
  if (wid == 0)
    max_val = reduce_max_warp<T>(max_val);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = max_val;
}
885
/**
 * Second-stage min reduction: takes the min over bufred[0..n) and writes
 * one partial result per block into bufred[blockIdx.x].  pinf is the
 * identity element (positive infinity) used to pad inactive lanes.
 */
template< typename T >
__global__ void reduce_min_kernel(T * bufred, const T pinf, const int n) {

  T min_val = pinf;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i<n ; i += str)
  {
    min_val = min(min_val, bufred[i]);
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  min_val = reduce_min_warp<T>(min_val);
  if (lane == 0)
    shared[wid] = min_val;
  __syncthreads();

  min_val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : pinf;
  if (wid == 0)
    min_val = reduce_min_warp<T>(min_val);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = min_val;
}
916
/**
 * Reduction stage for glsc3_many: sums column y (y = blockIdx.x < j) of the
 * n-by-j partial-result matrix bufred and stores the total in bufred[y].
 * Shared-memory tree reduction; assumes blockDim.x <= 1024.
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j
                                    ) {
  __shared__ T buf[1024] ;
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step)
  {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  int i = 512;
  while (i != 0)
  {
    if (threadIdx.x < i && (threadIdx.x + i) < n)
    {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    __syncthreads(); /* barrier after every halving step to avoid races */
    i = i>>1;
  }

  bufred[y] = buf[0];
}
951
952
/**
 * Weighted inner product \f$ \sum_i a_i b_i c_i \f$ (first stage).
 * Each block writes its partial sum to buf_h[blockIdx.x]; a second-stage
 * reduction (reduce_kernel) finishes the sum on the host side or device.
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
987
/**
 * Batched weighted inner products \f$ \sum_i a_i \, b_{y,i} \, c_i \f$ for
 * y = 0..j-1 in one pass.  Launched with a 2D block: threadIdx.x selects
 * the vector y (blockDim.x >= j), threadIdx.y strides over the data.
 * Each block writes j partial sums to buf_h[j*blockIdx.x + y].
 * Assumes blockDim.x * blockDim.y <= 1024 (shared buffer size).
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i+= str) {
      tmp += a[i] * b[y][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x+y] = tmp;
  __syncthreads();

  /* tree reduction over the threadIdx.y dimension */
  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x +y] += buf[(threadIdx.y + i)*blockDim.x+y];
    }
    __syncthreads();
    i = i>>1;
  }
  if (threadIdx.y == 0) {
    if( y < j) {
      buf_h[j*blockIdx.x+y] = buf[y];
    }
  }
}
1028
/**
 * Inner product \f$ \sum_i a_i b_i \f$ (first stage).  Each block writes
 * its partial sum to buf_h[blockIdx.x] for a second-stage reduction.
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
1063
/**
 * Squared L2 norm of a difference, \f$ \sum_i (a_i - b_i)^2 \f$ (first
 * stage).  Each block writes its partial sum to buf_h[blockIdx.x].
 * The square is computed as d*d rather than pow(d, 2.0) to avoid
 * promoting float data to double.
 */
template< typename T >
__global__ void glsubnorm2_kernel(const T * a,
                                  const T * b,
                                  T * buf_h,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    const T diff = a[i] - b[i];
    sum += diff * diff;
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
1098
/**
 * Vector sum \f$ \sum_i a_i \f$ (first stage).  Each block writes its
 * partial sum to buf_h[blockIdx.x] for a second-stage reduction.
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i<n ; i += str)
  {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
1133
/**
 * Vector max \f$ \max_i a_i \f$ (first stage).  ninf is the identity
 * element (negative infinity).  Each block writes its partial max to
 * buf_h[blockIdx.x] for a second-stage reduction.
 */
template< typename T >
__global__ void glmax_kernel(const T * a,
                             const T ninf,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T max_val = ninf;
  for (int i = idx; i<n ; i += str)
  {
    max_val = max(max_val, a[i]);
  }

  max_val = reduce_max_warp<T>(max_val);
  if (lane == 0)
    shared[wid] = max_val;
  __syncthreads();

  max_val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : ninf;
  if (wid == 0)
    max_val = reduce_max_warp<T>(max_val);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = max_val;

}
1169
/**
 * Vector min \f$ \min_i a_i \f$ (first stage).  pinf is the identity
 * element (positive infinity).  Each block writes its partial min to
 * buf_h[blockIdx.x] for a second-stage reduction.
 */
template< typename T >
__global__ void glmin_kernel(const T * a,
                             const T pinf,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T min_val = pinf;
  for (int i = idx; i<n ; i += str)
  {
    min_val = min(min_val, a[i]);
  }

  min_val = reduce_min_warp<T>(min_val);
  if (lane == 0)
    shared[wid] = min_val;
  __syncthreads();

  min_val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : pinf;
  if (wid == 0)
    min_val = reduce_min_warp<T>(min_val);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = min_val;

}
1205
/** Absolute value, in place: \f$ a_i = |a_i| \f$. */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}
1220
1221// ========================================================================== //
1222// Kernels for the point-wise operations
1223
/** Pointwise maximum, in place: \f$ a_i = \max(a_i, b_i) \f$. */
template <typename T>
__global__ void
    pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], b[i]);
}
1237
/** Pointwise maximum: \f$ a_i = \max(b_i, c_i) \f$. */
template <typename T>
__global__ void pwmax_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
}
1252
/** Pointwise maximum with a scalar, in place: \f$ a_i = \max(a_i, c) \f$. */
template <typename T>
__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], c);
}
1265
/** Pointwise maximum with a scalar: \f$ a_i = \max(b_i, c) \f$. */
template <typename T>
__global__ void pwmax_sca3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
}
1279
/** Pointwise minimum, in place: \f$ a_i = \min(a_i, b_i) \f$. */
template <typename T>
__global__ void
    pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], b[i]);
}
1293
/** Pointwise minimum: \f$ a_i = \min(b_i, c_i) \f$. */
template <typename T>
__global__ void pwmin_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
}
1308
/** Pointwise minimum with a scalar, in place: \f$ a_i = \min(a_i, c) \f$. */
template <typename T>
__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], c);
}
1321
/** Pointwise minimum with a scalar: \f$ a_i = \min(b_i, c) \f$. */
template <typename T>
__global__ void pwmin_sca3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
}
1335
1336#endif // __MATH_MATH_KERNEL_H__
const int i
const int e
const int j
__syncthreads()
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void addcol4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cwrap_kernel(T *__restrict__ a, const T min_val, const T max_val, const int n)
__global__ void pwmin_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void reduce_kernel(T *bufred, const int n)
__global__ void cdiv2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void invcol2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add4s3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T c1, const T c2, const T c3, const int n)
__inline__ __device__ T reduce_warp(T val)
__global__ void masked_scatter_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void masked_atomic_reduction_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
__global__ void pwmax_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h, const int j, const int n)
__global__ void addcol3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T s, const int n)
__global__ void cfill_mask_kernel(T *__restrict__ a, const T c, const int size, int *__restrict__ mask, const int mask_size)
__global__ void cdiv_kernel(T *__restrict__ a, const T c, const int n)
__global__ void face_masked_gather_copy_kernel(T *__restrict__ a, const T *__restrict__ b, const int *__restrict__ mask, const int *__restrict__ facet, const int n1, const int n2, const int lx, const int ly, const int lz, const int n_mask)
__global__ void glsc3_reduce_kernel(T *bufred, const int n, const int j)
__global__ void masked_gather_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:76
__global__ void pwmax_sca2_kernel(T *__restrict__ a, const T c, const int n)
__device__ __forceinline__ void face_gather_nonlinear_index(int *index, const int idx, const int lx, const int ly, const int lz)
Definition math_kernel.h:91
__global__ void reduce_max_kernel(T *bufred, const T ninf, const int n)
__global__ void glsubnorm2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void pwmin_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T c1, const T c2, const int n)
__inline__ __device__ T reduce_max_warp(T val)
__global__ void add2s1_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__device__ __forceinline__ int face_gather_idx(const int i, const int j, const int k, const int l, const int n1, const int n2, const int nf)
__global__ void add5s4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T *__restrict__ e, const T c1, const T c2, const T c3, const T c4, const int n)
__global__ void masked_gather_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:57
__global__ void add2s2_many_kernel(T *__restrict__ x, const T **p, const T *alpha, const int p_cur, const int n)
__global__ void pwmax_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void cmult_kernel(T *__restrict__ a, const T c, const int n)
Definition math_kernel.h:41
__global__ void addcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void pwmin_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void pwmax_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void col2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void col3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void sub2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void glmin_kernel(const T *a, const T pinf, T *buf_h, const int n)
__global__ void glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void cmult2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void pwmin_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void sub3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsum_kernel(const T *a, T *buf_h, const int n)
__global__ void masked_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
__global__ void add2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void vdot3_kernel(T *__restrict__ dot, const T *__restrict__ u1, const T *__restrict__ u2, const T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const int n)
__global__ void invcol1_kernel(T *__restrict__ a, const int n)
__global__ void add3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glmax_kernel(const T *a, const T ninf, T *buf_h, const int n)
__global__ void add4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cfill_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_scatter_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void reduce_min_kernel(T *bufred, const T pinf, const int n)
__global__ void vcross_kernel(T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const T *__restrict__ w1, const T *__restrict__ w2, const T *__restrict__ w3, const int n)
__global__ void addsqr2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void cadd2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void absval_kernel(T *__restrict__ a, const int n)
__global__ void invcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void subcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void cadd_kernel(T *__restrict__ a, const T c, const int n)
__inline__ __device__ T reduce_min_warp(T val)
real * bufred
Definition math.cu:662
Object for handling masks in Neko.
Definition mask.f90:34
real * buf
Definition pipecg_aux.cu:42
#define max(a, b)
Definition tensor.cu:40