Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math_kernel.h
Go to the documentation of this file.
1#ifndef __MATH_MATH_KERNEL_H__
2#define __MATH_MATH_KERNEL_H__
3/*
4 Copyright (c) 2021-2023, The Neko Authors
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following
16 disclaimer in the documentation and/or other materials provided
17 with the distribution.
18
19 * Neither the name of the authors nor the names of its
20 contributors may be used to endorse or promote products derived
21 from this software without specific prior written permission.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35*/
36
40template< typename T >
42 const T c,
43 const int n) {
44
45 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
46 const int str = blockDim.x * gridDim.x;
47
48 for (int i = idx; i < n; i += str) {
49 a[i] = c * a[i];
50 }
51}
52
56template< typename T >
58 T * __restrict__ b,
59 int * __restrict__ mask,
60 const int n,
61 const int n_mask) {
62
63 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
64 const int str = blockDim.x * gridDim.x;
65
66 for (int i = idx; i < n_mask; i += str) {
67 a[i] = b[mask[i+1]-1];
68 }
69}
70
74template< typename T >
76 T * __restrict__ b,
77 int * __restrict__ mask,
78 const int n,
79 const int n_mask) {
80
81 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
82 const int str = blockDim.x * gridDim.x;
83
84 for (int i = idx; i < n_mask; i += str) {
85 a[i] = b[mask[i]];
86 }
87}
88
90void face_gather_nonlinear_index(int *index, const int idx, const int lx,
91 const int ly, const int lz) {
92 const int idx2 = idx - 1;
93 index[3] = idx2 / (lx * ly * lz);
94 index[2] = (idx2 - (lx * ly * lz) * index[3]) / (lx * ly);
95 index[1] = (idx2 - (lx * ly * lz) * index[3] - (lx * ly) * index[2]) / lx;
96 index[0] = (idx2 - (lx * ly * lz) * index[3] - (lx * ly) * index[2]) -
97 lx * index[1];
98 index[0]++;
99 index[1]++;
100 index[2]++;
101 index[3]++;
102}
103
105int face_gather_idx(const int i, const int j, const int k, const int l,
106 const int n1, const int n2, const int nf) {
107 return ((i) + (n1) * (((j) - 1) + (n2) * (((k) - 1) + (nf) * (((l) - 1))))) - 1;
108}
109
113template< typename T >
115 const T * __restrict__ b,
116 const int * __restrict__ mask,
117 const int * __restrict__ facet,
118 const int n1,
119 const int n2,
120 const int lx,
121 const int ly,
122 const int lz,
123 const int n_mask) {
124 int index[4];
125
126 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
127 const int str = blockDim.x * gridDim.x;
128
129 for (int m = idx; m < n_mask; m += str) {
130 const int f = facet[m + 1];
131 face_gather_nonlinear_index(index, mask[m + 1], lx, ly, lz);
132
133 switch (f) {
134 case 1:
135 case 2:
136 a[m] = b[face_gather_idx(index[1], index[2], f, index[3], n1, n2, 6)];
137 break;
138 case 3:
139 case 4:
140 a[m] = b[face_gather_idx(index[0], index[2], f, index[3], n1, n2, 6)];
141 break;
142 case 5:
143 case 6:
144 a[m] = b[face_gather_idx(index[0], index[1], f, index[3], n1, n2, 6)];
145 break;
146 }
147 }
148}
149
153template< typename T >
155 T * __restrict__ b,
156 int * __restrict__ mask,
157 const int n,
158 const int n_mask) {
159
160 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
161 const int str = blockDim.x * gridDim.x;
162
163 for (int i = idx; i < n_mask; i += str) {
164 a[mask[i+1]-1] = b[i];
165 }
166}
167
171template< typename T >
173 T * __restrict__ b,
174 int * __restrict__ mask,
175 const int n,
176 const int n_mask) {
177
178 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
179 const int str = blockDim.x * gridDim.x;
180
181 for (int i = idx; i < n_mask; i += str) {
182 a[mask[i]] = b[i];
183 }
184}
185
189template< typename T >
191 T * __restrict__ b,
192 int * __restrict__ mask,
193 const int n,
194 const int n_mask) {
195
196 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
197 const int str = blockDim.x * gridDim.x;
198
199 for (int i = idx; i < n_mask; i += str) {
200 unsafeAtomicAdd( &(a[mask[i+1]-1]), b[i]);//a[mask[i]-1] = a[mask[i]-1] + b[i];
201 //atomicAdd( &(a[mask[i+1]-1]), b[i]);//a[mask[i]-1] = a[mask[i]-1] + b[i];
202 }
203}
204
208template< typename T >
210 T * __restrict__ b,
211 int * __restrict__ mask,
212 const int n,
213 const int n_mask) {
214
215 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
216 const int str = blockDim.x * gridDim.x;
217
218 for (int i = idx; i < n_mask; i += str) {
219 a[mask[i+1]-1] = b[mask[i+1]-1];
220 }
221}
222
226template< typename T >
228 T * __restrict__ b,
229 int * __restrict__ mask,
230 const int n,
231 const int n_mask) {
232
233 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
234 const int str = blockDim.x * gridDim.x;
235
236 for (int i = idx; i < n_mask; i += str) {
237 a[mask[i]] = b[mask[i]];
238 }
239}
240
244template< typename T >
246 const T c,
247 const int n,
248 int* __restrict__ mask,
249 const int n_mask) {
250
251 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
252 const int str = blockDim.x * gridDim.x;
253
254 for (int i = idx; i < n_mask; i += str) { a[mask[i]] = c; }
255}
256
260template< typename T >
262 T * __restrict__ b,
263 const T c,
264 const int n) {
265
266 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
267 const int str = blockDim.x * gridDim.x;
268
269 for (int i = idx; i < n; i += str) {
270 a[i] = c * b[i];
271 }
272}
273
277template< typename T >
279 const T c,
280 const int n) {
281
282 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
283 const int str = blockDim.x * gridDim.x;
284
285 for (int i = idx; i < n; i += str) {
286 a[i] = c / a[i];
287 }
288}
289
293template< typename T >
295 T * __restrict__ b,
296 const T c,
297 const int n) {
298
299 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
300 const int str = blockDim.x * gridDim.x;
301
302 for (int i = idx; i < n; i += str) {
303 a[i] = c / b[i];
304 }
305}
306
310template< typename T >
312 const T c,
313 const int n) {
314
315 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
316 const int str = blockDim.x * gridDim.x;
317
318 for (int i = idx; i < n; i += str) {
319 a[i] = a[i] + c;
320 }
321}
322
326template< typename T >
328 T * __restrict__ b,
329 const T c,
330 const int n) {
331
332 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
333 const int str = blockDim.x * gridDim.x;
334
335 for (int i = idx; i < n; i += str) {
336 a[i] = b[i] + c;
337 }
338}
339
343template< typename T >
345 const T min_val,
346 const T max_val,
347 const int n) {
348
349 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
350 const int str = blockDim.x * gridDim.x;
351 const T l = max_val - min_val;
352
353 for (int i = idx; i < n; i += str) {
354 a[i] = min_val + fmod(fmod(a[i] - min_val, l) + l, l);
355 }
356}
357
361template< typename T >
363 const T c,
364 const int n) {
365
366 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
367 const int str = blockDim.x * gridDim.x;
368
369 for (int i = idx; i < n; i += str) {
370 a[i] = c;
371 }
372}
373
377template< typename T >
379 const T * __restrict__ b,
380 const int n) {
381
382 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
383 const int str = blockDim.x * gridDim.x;
384
385 for (int i = idx; i < n; i += str) {
386 a[i] = a[i] + b[i];
387 }
388}
389
393template< typename T >
395 const T * __restrict__ b,
396 const T * __restrict__ c,
397 const int n) {
398
399 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
400 const int str = blockDim.x * gridDim.x;
401
402 for (int i = idx; i < n; i += str) {
403 a[i] = b[i] + c[i];
404 }
405}
406
410template< typename T >
412 const T * __restrict__ b,
413 const T * __restrict__ c,
414 const T * __restrict__ d,
415 const int n) {
416
417 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
418 const int str = blockDim.x * gridDim.x;
419
420 for (int i = idx; i < n; i += str) {
421 a[i] = b[i] + c[i] + d[i];
422 }
423}
424
428template< typename T >
430 const T * __restrict__ b,
431 const T c1,
432 const int n) {
433
434 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
435 const int str = blockDim.x * gridDim.x;
436
437 for (int i = idx; i < n; i += str) {
438 a[i] = c1 * a[i] + b[i];
439 }
440}
441
445template< typename T >
447 const T ** p,
448 const T * alpha,
449 const int p_cur,
450 const int n) {
451
452 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
453 const int str = blockDim.x * gridDim.x;
454
455
456 for (int i = idx; i < n; i+= str) {
457 T tmp = 0.0;
458 for (int j = 0; j < p_cur; j ++) {
459 tmp += p[j][i]*alpha[j];
460 }
461 x[i] += tmp;
462 }
463}
464
468template< typename T >
470 const T * __restrict__ b,
471 const T c1,
472 const int n) {
473
474 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
475 const int str = blockDim.x * gridDim.x;
476
477 for (int i = idx; i < n; i += str) {
478 a[i] = a[i] + c1 * b[i];
479 }
480}
481
485template< typename T >
487 const T * __restrict__ b,
488 const T c1,
489 const int n) {
490
491 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
492 const int str = blockDim.x * gridDim.x;
493
494 for (int i = idx; i < n; i += str) {
495 a[i] = a[i] + c1 * (b[i] * b[i]);
496 }
497}
498
502template< typename T >
504 const T * __restrict__ b,
505 const T * __restrict__ c,
506 const T c1,
507 const T c2,
508 const int n) {
509
510 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
511 const int str = blockDim.x * gridDim.x;
512
513 for (int i = idx; i < n; i += str) {
514 a[i] = c1 * b[i] + c2 * c[i];
515 }
516}
517
521template< typename T >
523 const T * __restrict__ b,
524 const T * __restrict__ c,
525 const T * __restrict__ d,
526 const T c1,
527 const T c2,
528 const T c3,
529 const int n) {
530
531 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
532 const int str = blockDim.x * gridDim.x;
533
534 for (int i = idx; i < n; i += str) {
535 a[i] = c1 * b[i] + c2 * c[i] + c3 * d[i];
536 }
537}
538
542template< typename T >
544 const T * __restrict__ b,
545 const T * __restrict__ c,
546 const T * __restrict__ d,
547 const T * __restrict__ e,
548 const T c1,
549 const T c2,
550 const T c3,
551 const T c4,
552 const int n) {
553
554 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
555 const int str = blockDim.x * gridDim.x;
556
557 for (int i = idx; i < n; i += str) {
558 a[i] = a[i] + c1 * b[i] + c2 * c[i] + c3 * d[i] + c4 * e[i];
559 }
560}
561
565template< typename T >
567 const int n) {
568
569 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
570 const int str = blockDim.x * gridDim.x;
571 const T one = 1.0;
572
573 for (int i = idx; i < n; i += str) {
574 a[i] = one / a[i];
575 }
576}
577
581template< typename T >
583 const T * __restrict__ b,
584 const int n) {
585
586 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
587 const int str = blockDim.x * gridDim.x;
588
589 for (int i = idx; i < n; i += str) {
590 a[i] = a[i] / b[i];
591 }
592}
593
597template< typename T >
599 const T * __restrict__ b,
600 const T * __restrict__ c,
601 const int n) {
602
603 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
604 const int str = blockDim.x * gridDim.x;
605
606 for (int i = idx; i < n; i += str) {
607 a[i] = b[i] / c[i];
608 }
609}
610
614template< typename T >
616 const T * __restrict__ b,
617 const int n) {
618
619 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
620 const int str = blockDim.x * gridDim.x;
621
622 for (int i = idx; i < n; i += str) {
623 a[i] = a[i] * b[i];
624 }
625}
626
630template< typename T >
632 const T * __restrict__ b,
633 const T * __restrict__ c,
634 const int n) {
635
636 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
637 const int str = blockDim.x * gridDim.x;
638
639 for (int i = idx; i < n; i += str) {
640 a[i] = b[i] * c[i];
641 }
642}
643
647template< typename T >
649 const T * __restrict__ b,
650 const T * __restrict__ c,
651 const int n) {
652
653 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
654 const int str = blockDim.x * gridDim.x;
655
656 for (int i = idx; i < n; i += str) {
657 a[i] = a[i] - b[i] * c[i];
658 }
659}
660
664template< typename T >
666 const T * __restrict__ b,
667 const int n) {
668
669 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
670 const int str = blockDim.x * gridDim.x;
671
672 for (int i = idx; i < n; i += str) {
673 a[i] = a[i] - b[i];
674 }
675}
676
680template< typename T >
682 const T * __restrict__ b,
683 const T * __restrict__ c,
684 const int n) {
685
686 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
687 const int str = blockDim.x * gridDim.x;
688
689 for (int i = idx; i < n; i += str) {
690 a[i] = b[i] - c[i];
691 }
692}
693
697template< typename T >
699 const T * __restrict__ b,
700 const T * __restrict__ c,
701 const int n) {
702
703 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
704 const int str = blockDim.x * gridDim.x;
705
706 for (int i = idx; i < n; i += str) {
707 a[i] = a[i] + b[i] * c[i];
708 }
709
710}
711
715template< typename T >
717 const T * __restrict__ b,
718 const T * __restrict__ c,
719 const T * __restrict__ d,
720 const int n) {
721
722 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
723 const int str = blockDim.x * gridDim.x;
724
725 for (int i = idx; i < n; i += str) {
726 a[i] = a[i] + b[i] * c[i] * d[i];
727 }
728
729}
730
734template< typename T >
736 const T * __restrict__ b,
737 const T * __restrict__ c,
738 const T s,
739 const int n) {
740
741 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
742 const int str = blockDim.x * gridDim.x;
743
744 for (int i = idx; i < n; i += str) {
745 a[i] = a[i] + s * b[i] * c[i];
746 }
747
748}
749
753template< typename T >
755 const T * __restrict__ u1,
756 const T * __restrict__ u2,
757 const T * __restrict__ u3,
758 const T * __restrict__ v1,
759 const T * __restrict__ v2,
760 const T * __restrict__ v3,
761 const int n) {
762
763 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
764 const int str = blockDim.x * gridDim.x;
765
766 for (int i = idx; i < n; i += str) {
767 dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
768 }
769
770}
771
775template< typename T >
777 T * __restrict__ u2,
778 T * __restrict__ u3,
779 const T * __restrict__ v1,
780 const T * __restrict__ v2,
781 const T * __restrict__ v3,
782 const T * __restrict__ w1,
783 const T * __restrict__ w2,
784 const T * __restrict__ w3,
785 const int n) {
786
787 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
788 const int str = blockDim.x * gridDim.x;
789
790 for (int i = idx; i < n; i += str) {
791 u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
792 u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
793 u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
794 }
795}
796
800template< typename T>
802 val += __shfl_down(val, 32);
803 val += __shfl_down(val, 16);
804 val += __shfl_down(val, 8);
805 val += __shfl_down(val, 4);
806 val += __shfl_down(val, 2);
807 val += __shfl_down(val, 1);
808 return val;
809}
810
814template< typename T>
816 val = max(val, __shfl_down(val, 32));
817 val = max(val, __shfl_down(val, 16));
818 val = max(val, __shfl_down(val, 8));
819 val = max(val, __shfl_down(val, 4));
820 val = max(val, __shfl_down(val, 2));
821 val = max(val, __shfl_down(val, 1));
822 return val;
823}
824
828template< typename T>
830 val = min(val, __shfl_down(val, 32));
831 val = min(val, __shfl_down(val, 16));
832 val = min(val, __shfl_down(val, 8));
833 val = min(val, __shfl_down(val, 4));
834 val = min(val, __shfl_down(val, 2));
835 val = min(val, __shfl_down(val, 1));
836 return val;
837}
838
842template< typename T >
843__global__ void reduce_kernel(T * bufred, const int n) {
844
845 T sum = 0;
846 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
847 const int str = blockDim.x * gridDim.x;
848 for (int i = idx; i<n ; i += str)
849 {
850 sum += bufred[i];
851 }
852
853 __shared__ T shared[64];
854 unsigned int lane = threadIdx.x % warpSize;
855 unsigned int wid = threadIdx.x / warpSize;
856
858 if (lane == 0)
859 shared[wid] = sum;
861
862 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
863 if (wid == 0)
865
866 if (threadIdx.x == 0)
867 bufred[blockIdx.x] = sum;
868}
869
873template< typename T >
874__global__ void reduce_max_kernel(T * bufred, const T ninf, const int n) {
875
876 T max_val = ninf;
877 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
878 const int str = blockDim.x * gridDim.x;
879 for (int i = idx; i<n ; i += str)
880 {
882 }
883
884 __shared__ T shared[64];
885 unsigned int lane = threadIdx.x % warpSize;
886 unsigned int wid = threadIdx.x / warpSize;
887
889 if (lane == 0)
890 shared[wid] = max_val;
892
894 if (wid == 0)
896
897 if (threadIdx.x == 0)
899}
900
904template< typename T >
905__global__ void reduce_min_kernel(T * bufred, const T pinf, const int n) {
906
907 T min_val = pinf;
908 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
909 const int str = blockDim.x * gridDim.x;
910 for (int i = idx; i<n ; i += str)
911 {
913 }
914
915 __shared__ T shared[64];
916 unsigned int lane = threadIdx.x % warpSize;
917 unsigned int wid = threadIdx.x / warpSize;
918
920 if (lane == 0)
921 shared[wid] = min_val;
923
925 if (wid == 0)
927
928 if (threadIdx.x == 0)
930}
931
936template< typename T >
938 const int n,
939 const int j
940 ) {
941 __shared__ T buf[1024] ;
942 const int idx = threadIdx.x;
943 const int y= blockIdx.x;
944 const int step = blockDim.x;
945
946 buf[idx]=0;
947 for (int i=idx ; i<n ; i+=step)
948 {
949 buf[idx] += bufred[i*j + y];
950 }
952
953 int i = 512;
954 while (i != 0)
955 {
956 if(threadIdx.x < i && (threadIdx.x + i) < n )
957 {
958 buf[threadIdx.x] += buf[threadIdx.x + i] ;
959 }
960 i = i>>1;
962 }
963
964 bufred[y] = buf[0];
965}
966
970template< typename T >
972 const T * b,
973 const T * c,
974 T * buf_h,
975 const int n) {
976
977 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
978 const int str = blockDim.x * gridDim.x;
979
980 const unsigned int lane = threadIdx.x % warpSize;
981 const unsigned int wid = threadIdx.x / warpSize;
982
983 __shared__ T shared[64];
984 T sum = 0.0;
985 for (int i = idx; i < n; i+= str) {
986 sum += a[i] * b[i] * c[i];
987 }
988
990 if (lane == 0)
991 shared[wid] = sum;
993
994 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
995 if (wid == 0)
997
998 if (threadIdx.x == 0)
999 buf_h[blockIdx.x] = sum;
1000}
1001
1005template< typename T >
1007 const T ** b,
1008 const T * c,
1009 T * buf_h,
1010 const int j,
1011 const int n) {
1012
1013 const int idx = blockIdx.x * blockDim.y + threadIdx.y;
1014 const int str = blockDim.y * gridDim.x;
1015 const int y = threadIdx.x;
1016
1017 __shared__ T buf[1024];
1018 T tmp = 0;
1019 if(y < j){
1020 for (int i = idx; i < n; i+= str) {
1021 tmp += a[i] * b[threadIdx.x][i] * c[i];
1022 }
1023 }
1024
1025 buf[threadIdx.y*blockDim.x+y] = tmp;
1026 __syncthreads();
1027
1028 int i = blockDim.y>>1;
1029 while (i != 0) {
1030 if (threadIdx.y < i) {
1031 buf[threadIdx.y*blockDim.x +y] += buf[(threadIdx.y + i)*blockDim.x+y];
1032 }
1033 __syncthreads();
1034 i = i>>1;
1035 }
1036 if (threadIdx.y == 0) {
1037 if( y < j) {
1038 buf_h[j*blockIdx.x+y] = buf[y];
1039 }
1040 }
1041}
1042
1046template< typename T >
1048 const T * b,
1049 T * buf_h,
1050 const int n) {
1051
1052 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1053 const int str = blockDim.x * gridDim.x;
1054
1055 const unsigned int lane = threadIdx.x % warpSize;
1056 const unsigned int wid = threadIdx.x / warpSize;
1057
1058 __shared__ T shared[64];
1059 T sum = 0.0;
1060 for (int i = idx; i < n; i+= str) {
1061 sum += a[i] * b[i];
1062 }
1063
1065 if (lane == 0)
1066 shared[wid] = sum;
1067 __syncthreads();
1068
1069 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1070 if (wid == 0)
1072
1073 if (threadIdx.x == 0)
1074 buf_h[blockIdx.x] = sum;
1075
1076}
1077
1081template< typename T >
1083 const T * b,
1084 T * buf_h,
1085 const int n) {
1086
1087 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1088 const int str = blockDim.x * gridDim.x;
1089
1090 const unsigned int lane = threadIdx.x % warpSize;
1091 const unsigned int wid = threadIdx.x / warpSize;
1092
1093 __shared__ T shared[64];
1094 T sum = 0.0;
1095 for (int i = idx; i < n; i+= str) {
1096 sum += pow(a[i] - b[i], 2.0);
1097 }
1098
1100 if (lane == 0)
1101 shared[wid] = sum;
1102 __syncthreads();
1103
1104 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1105 if (wid == 0)
1107
1108 if (threadIdx.x == 0)
1109 buf_h[blockIdx.x] = sum;
1110
1111}
1112
1116template< typename T >
1118 T * buf_h,
1119 const int n) {
1120
1121 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1122 const int str = blockDim.x * gridDim.x;
1123
1124 const unsigned int lane = threadIdx.x % warpSize;
1125 const unsigned int wid = threadIdx.x / warpSize;
1126
1127 __shared__ T shared[64];
1128 T sum = 0;
1129 for (int i = idx; i<n ; i += str)
1130 {
1131 sum += a[i];
1132 }
1133
1135 if (lane == 0)
1136 shared[wid] = sum;
1137 __syncthreads();
1138
1139 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1140 if (wid == 0)
1142
1143 if (threadIdx.x == 0)
1144 buf_h[blockIdx.x] = sum;
1145
1146}
1147
1151template< typename T >
1153 const T ninf,
1154 T * buf_h,
1155 const int n) {
1156
1157 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1158 const int str = blockDim.x * gridDim.x;
1159
1160 const unsigned int lane = threadIdx.x % warpSize;
1161 const unsigned int wid = threadIdx.x / warpSize;
1162
1163 __shared__ T shared[64];
1164 T max_val = ninf;
1165 for (int i = idx; i<n ; i += str)
1166 {
1167 max_val = max(max_val, a[i]);
1168 }
1169
1171 if (lane == 0)
1172 shared[wid] = max_val;
1173 __syncthreads();
1174
1176 if (wid == 0)
1178
1179 if (threadIdx.x == 0)
1180 buf_h[blockIdx.x] = max_val;
1181
1182}
1183
1187template< typename T >
1189 const T pinf,
1190 T * buf_h,
1191 const int n) {
1192
1193 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1194 const int str = blockDim.x * gridDim.x;
1195
1196 const unsigned int lane = threadIdx.x % warpSize;
1197 const unsigned int wid = threadIdx.x / warpSize;
1198
1199 __shared__ T shared[64];
1200 T min_val = pinf;
1201 for (int i = idx; i<n ; i += str)
1202 {
1203 min_val = min(min_val, a[i]);
1204 }
1205
1207 if (lane == 0)
1208 shared[wid] = min_val;
1209 __syncthreads();
1210
1212 if (wid == 0)
1214
1215 if (threadIdx.x == 0)
1216 buf_h[blockIdx.x] = min_val;
1217
1218}
1219
1223template< typename T >
1225 const int n) {
1226
1227 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1228 const int str = blockDim.x * gridDim.x;
1229
1230 for (int i = idx; i < n; i += str) {
1231 a[i] = fabs(a[i]);
1232 }
1233}
1234
1235// ========================================================================== //
1236// Kernels for the point-wise operations
1237
1242template <typename T>
1243__global__ void
1244 pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {
1245
1246 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1247 const int str = blockDim.x * gridDim.x;
1248
1249 for (int i = idx; i < n; i += str) a[i] = max(a[i], b[i]);
1250}
1251
1256template <typename T>
1258 T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
1259 const int n) {
1260
1261 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1262 const int str = blockDim.x * gridDim.x;
1263
1264 for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
1265}
1266
1271template <typename T>
1272__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {
1273
1274 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1275 const int str = blockDim.x * gridDim.x;
1276
1277 for (int i = idx; i < n; i += str) a[i] = max(a[i], c);
1278}
1279
1284template <typename T>
1286 T* __restrict__ a, const T* __restrict b, const T c, const int n) {
1287
1288 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1289 const int str = blockDim.x * gridDim.x;
1290
1291 for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
1292}
1293
1298template <typename T>
1299__global__ void
1300 pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {
1301
1302 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1303 const int str = blockDim.x * gridDim.x;
1304
1305 for (int i = idx; i < n; i += str) a[i] = min(a[i], b[i]);
1306}
1307
1312template <typename T>
1314 T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
1315 const int n) {
1316
1317 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1318 const int str = blockDim.x * gridDim.x;
1319
1320 for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
1321}
1322
1327template <typename T>
1328__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {
1329
1330 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1331 const int str = blockDim.x * gridDim.x;
1332
1333 for (int i = idx; i < n; i += str) a[i] = min(a[i], c);
1334}
1335
1340template <typename T>
1342 T* __restrict__ a, const T* __restrict b, const T c, const int n) {
1343
1344 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1345 const int str = blockDim.x * gridDim.x;
1346
1347 for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
1348}
1349
1350#endif // __MATH_MATH_KERNEL_H__
const int i
const int e
const int j
__syncthreads()
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void addcol4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cwrap_kernel(T *__restrict__ a, const T min_val, const T max_val, const int n)
__global__ void pwmin_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void reduce_kernel(T *bufred, const int n)
__global__ void cdiv2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void invcol2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add4s3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T c1, const T c2, const T c3, const int n)
__inline__ __device__ T reduce_warp(T val)
__global__ void masked_scatter_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void masked_atomic_reduction_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
__global__ void pwmax_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h, const int j, const int n)
__global__ void addcol3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T s, const int n)
__global__ void cfill_mask_kernel(T *__restrict__ a, const T c, const int size, int *__restrict__ mask, const int mask_size)
__global__ void cdiv_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_copy_kernel_aligned(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void face_masked_gather_copy_kernel(T *__restrict__ a, const T *__restrict__ b, const int *__restrict__ mask, const int *__restrict__ facet, const int n1, const int n2, const int lx, const int ly, const int lz, const int n_mask)
__global__ void glsc3_reduce_kernel(T *bufred, const int n, const int j)
__global__ void masked_gather_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:76
__global__ void pwmax_sca2_kernel(T *__restrict__ a, const T c, const int n)
__device__ __forceinline__ void face_gather_nonlinear_index(int *index, const int idx, const int lx, const int ly, const int lz)
Definition math_kernel.h:91
__global__ void reduce_max_kernel(T *bufred, const T ninf, const int n)
__global__ void glsubnorm2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void pwmin_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T c1, const T c2, const int n)
__inline__ __device__ T reduce_max_warp(T val)
__global__ void add2s1_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__device__ __forceinline__ int face_gather_idx(const int i, const int j, const int k, const int l, const int n1, const int n2, const int nf)
__global__ void add5s4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T *__restrict__ e, const T c1, const T c2, const T c3, const T c4, const int n)
__global__ void masked_gather_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:57
__global__ void add2s2_many_kernel(T *__restrict__ x, const T **p, const T *alpha, const int p_cur, const int n)
__global__ void pwmax_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void cmult_kernel(T *__restrict__ a, const T c, const int n)
Definition math_kernel.h:41
__global__ void addcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void pwmin_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void pwmax_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void col2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void masked_copy_kernel_0(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void col3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void sub2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void glmin_kernel(const T *a, const T pinf, T *buf_h, const int n)
__global__ void glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void cmult2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void pwmin_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void sub3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsum_kernel(const T *a, T *buf_h, const int n)
__global__ void glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
__global__ void add2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void vdot3_kernel(T *__restrict__ dot, const T *__restrict__ u1, const T *__restrict__ u2, const T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const int n)
__global__ void invcol1_kernel(T *__restrict__ a, const int n)
__global__ void add3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glmax_kernel(const T *a, const T ninf, T *buf_h, const int n)
__global__ void add4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cfill_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_scatter_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void reduce_min_kernel(T *bufred, const T pinf, const int n)
__global__ void vcross_kernel(T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const T *__restrict__ w1, const T *__restrict__ w2, const T *__restrict__ w3, const int n)
__global__ void addsqr2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void cadd2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void absval_kernel(T *__restrict__ a, const int n)
__global__ void invcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void subcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void cadd_kernel(T *__restrict__ a, const T c, const int n)
__inline__ __device__ T reduce_min_warp(T val)
real * bufred
Definition math.cu:679
Object for handling masks in Neko.
Definition mask.f90:34
real * buf
Definition pipecg_aux.cu:42
#define max(a, b)
Definition tensor.cu:40