Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math_kernel.h
Go to the documentation of this file.
1#ifndef __MATH_MATH_KERNEL_H__
2#define __MATH_MATH_KERNEL_H__
3/*
4 Copyright (c) 2021-2023, The Neko Authors
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following
16 disclaimer in the documentation and/or other materials provided
17 with the distribution.
18
19 * Neither the name of the authors nor the names of its
20 contributors may be used to endorse or promote products derived
21 from this software without specific prior written permission.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35*/
36
/** In-place scalar multiplication: a = c * a (grid-stride loop over n entries). */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = c * a[i];
}
52
/**
 * Gather b into the first n_mask entries of a: a[i] = b[mask[i+1]-1].
 * NOTE(review): the mask appears to use 1-based indices with the count stored
 * in mask[0] (Fortran convention) — confirm against the host-side caller.
 */
template< typename T >
__global__ void masked_gather_copy_kernel(T * __restrict__ a,
                                          T * __restrict__ b,
                                          int * __restrict__ mask,
                                          const int n,
                                          const int n_mask) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n_mask; i += stride)
    a[i] = b[mask[i + 1] - 1];
}
70
/**
 * Gather b into the first n_mask entries of a using 0-based mask indices:
 * a[i] = b[mask[i]] (no 1-based offset, unlike masked_gather_copy_kernel).
 */
template< typename T >
__global__ void masked_gather_copy_aligned_kernel(T * __restrict__ a,
                                                  T * __restrict__ b,
                                                  int * __restrict__ mask,
                                                  const int n,
                                                  const int n_mask) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n_mask; i += stride)
    a[i] = b[mask[i]];
}
88
/**
 * Scatter the first n_mask entries of b into a: a[mask[i+1]-1] = b[i].
 * NOTE(review): mask appears to be 1-based with the count in mask[0] — confirm.
 */
template< typename T >
__global__ void masked_scatter_copy_kernel(T * __restrict__ a,
                                           T * __restrict__ b,
                                           int * __restrict__ mask,
                                           const int n,
                                           const int n_mask) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n_mask; i += stride)
    a[mask[i + 1] - 1] = b[i];
}
106
/**
 * Atomically accumulate b into masked locations of a:
 * a[mask[i+1]-1] += b[i], for i < n_mask.
 * Uses unsafeAtomicAdd, an AMD/HIP intrinsic (hardware FP atomics without
 * page-migration safety); the portable equivalent would be atomicAdd.
 * NOTE(review): mask appears 1-based with the count in mask[0] — confirm.
 */
template< typename T >
__global__ void masked_atomic_reduction_kernel(T * __restrict__ a,
                                               T * __restrict__ b,
                                               int * __restrict__ mask,
                                               const int n,
                                               const int n_mask) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n_mask; i += stride) {
    unsafeAtomicAdd( &(a[mask[i+1]-1]), b[i]);
  }
}
125
/**
 * Copy b to a only at the masked locations:
 * a[mask[i+1]-1] = b[mask[i+1]-1], for i < n_mask.
 * NOTE(review): mask appears 1-based with the count in mask[0] — confirm.
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int n_mask) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n_mask; i += stride) {
    const int k = mask[i + 1] - 1;
    a[k] = b[k];
  }
}
143
/** Fill the masked entries of a with the constant c: a[mask[i]] = c, i < n_mask. */
template< typename T >
__global__ void cfill_mask_kernel(T * __restrict__ a,
                                  const T c,
                                  const int n,
                                  int * __restrict__ mask,
                                  const int n_mask) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n_mask; i += stride)
    a[mask[i]] = c;
}
159
/** Scaled copy: a = c * b. */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = c * b[i];
}
176
/** In-place scalar division: a = c / a (element-wise reciprocal scaled by c). */
template< typename T >
__global__ void cdiv_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = c / a[i];
}
192
/** Scalar-over-vector division into a: a = c / b. */
template< typename T >
__global__ void cdiv2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = c / b[i];
}
209
/** In-place scalar addition: a = a + c. */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = a[i] + c;
}
225
/** Shifted copy: a = b + c. */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = b[i] + c;
}
242
/** Constant fill: a[i] = c for all i < n. */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = c;
}
258
/** In-place vector addition: a = a + b. */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = a[i] + b[i];
}
274
/** Vector addition into a: a = b + c. */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = b[i] + c[i];
}
291
/** Three-vector addition into a: a = b + c + d. */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = b[i] + c[i] + d[i];
}
309
/** Scaled in-place update (scale a): a = c1 * a + b. */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = c1 * a[i] + b[i];
}
326
/**
 * Accumulate several scaled vectors at once:
 * x = x + sum_{k < p_cur} alpha[k] * p[k],
 * where p is an array of p_cur device vectors of length n.
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride) {
    T acc = 0.0;
    for (int k = 0; k < p_cur; k++)
      acc += p[k][i] * alpha[k];
    x[i] += acc;
  }
}
349
/** AXPY-style update (scale b): a = a + c1 * b. */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = a[i] + c1 * b[i];
}
366
/** Accumulate a scaled square: a = a + c1 * b^2 (element-wise). */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = a[i] + c1 * (b[i] * b[i]);
}
383
/** Weighted two-vector sum: a = c1 * b + c2 * c. */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = c1 * b[i] + c2 * c[i];
}
402
/** Weighted three-vector sum: a = c1 * b + c2 * c + c3 * d. */
template< typename T >
__global__ void add4s3_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T c1,
                              const T c2,
                              const T c3,
                              const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = c1 * b[i] + c2 * c[i] + c3 * d[i];
}
423
/**
 * Accumulate four weighted vectors into a:
 * a = a + c1 * b + c2 * c + c3 * d + c4 * e.
 */
template< typename T >
__global__ void add5s4_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T * __restrict__ e,
                              const T c1,
                              const T c2,
                              const T c3,
                              const T c4,
                              const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = a[i] + c1 * b[i] + c2 * c[i] + c3 * d[i] + c4 * e[i];
}
446
/** In-place element-wise reciprocal: a = 1 / a. */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = tid; i < n; i += stride)
    a[i] = one / a[i];
}
462
/** In-place element-wise division: a = a / b. */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = a[i] / b[i];
}
478
/** Element-wise division into a: a = b / c. */
template< typename T >
__global__ void invcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = b[i] / c[i];
}
495
/** In-place element-wise product: a = a * b. */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = a[i] * b[i];
}
511
/** Element-wise product into a: a = b * c. */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = b[i] * c[i];
}
528
/** Subtract an element-wise product: a = a - b * c. */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = a[i] - b[i] * c[i];
}
545
/** In-place vector subtraction: a = a - b. */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = a[i] - b[i];
}
561
/** Vector subtraction into a: a = b - c. */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = b[i] - c[i];
}
578
/** Accumulate an element-wise product: a = a + b * c. */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = a[i] + b[i] * c[i];
}
596
/** Accumulate a triple element-wise product: a = a + b * c * d. */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = a[i] + b[i] * c[i] * d[i];
}
615
/** Accumulate a scaled element-wise product: a = a + s * b * c. */
template< typename T >
__global__ void addcol3s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T * __restrict__ c,
                                 const T s,
                                 const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = a[i] + s * b[i] * c[i];
}
634
/**
 * Point-wise 3-component dot product:
 * dot[i] = u1[i]*v1[i] + u2[i]*v2[i] + u3[i]*v3[i].
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
}
656
/** Point-wise 3-component cross product: (u1,u2,u3) = (v1,v2,v3) x (w1,w2,w3). */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride) {
    u1[i] = v2[i] * w3[i] - v3[i] * w2[i];
    u2[i] = v3[i] * w1[i] - v1[i] * w3[i];
    u3[i] = v1[i] * w2[i] - v2[i] * w1[i];
  }
}
681
/**
 * Sum-reduce val across the lanes of a warp using shuffle-down steps;
 * after the call, lane 0 holds the warp-wide sum.
 * NOTE(review): the first offset is 32, so this assumes a 64-lane wavefront
 * (AMD/HIP-style __shfl_down with no mask) — confirm the target is not
 * 32-lane CUDA, where __shfl_down_sync would be required.
 */
template< typename T>
__inline__ __device__ T reduce_warp(T val) {
  for (int offset = 32; offset > 0; offset >>= 1)
    val += __shfl_down(val, offset);
  return val;
}
695
/**
 * Final reduction: sum the first n entries of bufred and write one partial
 * sum per block back to bufred[blockIdx.x].
 * Fix: the visible code stored each thread's private sum into shared[] and
 * read it back with no warp reduction and no barrier, producing garbage;
 * the warp-level reduce_warp calls and the __syncthreads() between the
 * shared-memory write and read are restored.
 * shared[64] bounds the supported block size: blockDim.x / warpSize <= 64.
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[64];
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);              /* per-warp partial sum */
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();                        /* publish warp sums before reading */

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);            /* first warp combines all warp sums */

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
726
/**
 * Column-wise reduction of j interleaved partial-sum streams stored in
 * bufred (stride j, column y = blockIdx.x); the result for column y is
 * written to bufred[y]. Requires blockDim.x <= 1024.
 * Fix: the visible code performed the shared-memory tree reduction with no
 * __syncthreads() after the accumulation phase and none between tree levels
 * (a data race), and every thread stored the result; barriers restored and
 * the final store guarded to a single thread.
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j
                                    ) {
  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i * j + y];
  }
  __syncthreads();                        /* all partial sums in buf before reducing */

  int i = 512;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    __syncthreads();                      /* level must finish before the next halving */
    i = i >> 1;
  }

  if (threadIdx.x == 0)
    bufred[y] = buf[0];
}
761
/**
 * Weighted inner product sum(a .* b .* c): each block writes its partial sum
 * to buf_h[blockIdx.x] (to be reduced further by reduce_kernel / the host).
 * Fix: the visible code stored un-reduced per-thread sums into shared[] and
 * read them back without a barrier; the reduce_warp calls and the
 * __syncthreads() between shared write and read are restored.
 * shared[64] requires blockDim.x / warpSize <= 64.
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);              /* per-warp partial sum */
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();                        /* publish warp sums before reading */

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);            /* first warp combines warp sums */

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
796
/**
 * j simultaneous weighted inner products sum(a .* b[y] .* c), y < j:
 * per-block partials are written to buf_h[j*blockIdx.x + y].
 * Launch layout: threadIdx.x indexes the vector (requires blockDim.x >= j),
 * threadIdx.y strides the data; blockDim.x * blockDim.y <= 1024 and
 * blockDim.y a power of two (tree reduction over y).
 * Fix: the visible shared-memory tree reduction had no __syncthreads()
 * after the store of tmp nor between tree levels (a data race); restored.
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y * blockDim.x + y] = tmp;
  __syncthreads();                        /* all partials stored before reducing */

  int i = blockDim.y >> 1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y * blockDim.x + y] += buf[(threadIdx.y + i) * blockDim.x + y];
    }
    __syncthreads();                      /* level must finish before the next halving */
    i = i >> 1;
  }

  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j * blockIdx.x + y] = buf[y];
    }
  }
}
837
/**
 * Inner product sum(a .* b): each block writes its partial sum to
 * buf_h[blockIdx.x].
 * Fix: restored the reduce_warp calls and the __syncthreads() between the
 * shared-memory write and read that the visible code was missing (race /
 * wrong result). shared[64] requires blockDim.x / warpSize <= 64.
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);              /* per-warp partial sum */
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();                        /* publish warp sums before reading */

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);            /* first warp combines warp sums */

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
872
/**
 * Squared-difference norm partials: each block writes its partial of
 * sum((a - b)^2) to buf_h[blockIdx.x].
 * Fixes: (1) pow(x, 2.0) forced a double-precision pow call even for
 * T = float — replaced by a plain multiply, which is exact and cheap;
 * (2) restored the reduce_warp calls and the __syncthreads() between the
 * shared-memory write and read that the visible code was missing.
 * shared[64] requires blockDim.x / warpSize <= 64.
 */
template< typename T >
__global__ void glsubnorm2_kernel(const T * a,
                                  const T * b,
                                  T * buf_h,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    const T diff = a[i] - b[i];
    sum += diff * diff;                   /* (a-b)^2 without a double pow() */
  }

  sum = reduce_warp<T>(sum);              /* per-warp partial sum */
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();                        /* publish warp sums before reading */

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);            /* first warp combines warp sums */

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
907
/**
 * Global sum partials: each block writes its partial of sum(a) to
 * buf_h[blockIdx.x].
 * Fix: restored the reduce_warp calls and the __syncthreads() between the
 * shared-memory write and read that the visible code was missing.
 * shared[64] requires blockDim.x / warpSize <= 64.
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);              /* per-warp partial sum */
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();                        /* publish warp sums before reading */

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);            /* first warp combines warp sums */

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
942
/** In-place absolute value: a = |a| (fabs resolves per the element type T). */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = fabs(a[i]);
}
957
958// ========================================================================== //
959// Kernels for the point-wise operations
960
/** Point-wise maximum, in place: a = max(a, b). */
template <typename T>
__global__ void
    pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = max(a[i], b[i]);
}
974
/** Point-wise maximum into a: a = max(b, c). */
template <typename T>
__global__ void pwmax_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = max(b[i], c[i]);
}
989
/** Point-wise maximum against a scalar, in place: a = max(a, c). */
template <typename T>
__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = max(a[i], c);
}
1002
/**
 * Point-wise maximum of a vector and a scalar into a: a = max(b, c).
 * Fix: b was declared `__restrict` (compiler-extension spelling); changed to
 * `__restrict__` for consistency with every other kernel in this file.
 */
template <typename T>
__global__ void pwmax_sca3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
}
1016
/** Point-wise minimum, in place: a = min(a, b). */
template <typename T>
__global__ void
    pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = min(a[i], b[i]);
}
1030
/** Point-wise minimum into a: a = min(b, c). */
template <typename T>
__global__ void pwmin_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = min(b[i], c[i]);
}
1045
/** Point-wise minimum against a scalar, in place: a = min(a, c). */
template <typename T>
__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride)
    a[i] = min(a[i], c);
}
1058
/**
 * Point-wise minimum of a vector and a scalar into a: a = min(b, c).
 * Fix: b was declared `__restrict` (compiler-extension spelling); changed to
 * `__restrict__` for consistency with every other kernel in this file.
 */
template <typename T>
__global__ void pwmin_sca3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
}
1072
1073#endif // __MATH_MATH_KERNEL_H__
const int i
const int e
const int j
__syncthreads()
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void addcol4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void pwmin_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void reduce_kernel(T *bufred, const int n)
__global__ void cdiv2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void invcol2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add4s3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T c1, const T c2, const T c3, const int n)
__inline__ __device__ T reduce_warp(T val)
__global__ void masked_atomic_reduction_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
__global__ void pwmax_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h, const int j, const int n)
__global__ void addcol3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T s, const int n)
__global__ void cfill_mask_kernel(T *__restrict__ a, const T c, const int size, int *__restrict__ mask, const int mask_size)
__global__ void cdiv_kernel(T *__restrict__ a, const T c, const int n)
__global__ void glsc3_reduce_kernel(T *bufred, const int n, const int j)
__global__ void masked_gather_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:76
__global__ void pwmax_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void glsubnorm2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void pwmin_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T c1, const T c2, const int n)
__global__ void add2s1_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void add5s4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T *__restrict__ e, const T c1, const T c2, const T c3, const T c4, const int n)
__global__ void masked_gather_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:57
__global__ void add2s2_many_kernel(T *__restrict__ x, const T **p, const T *alpha, const int p_cur, const int n)
__global__ void pwmax_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void cmult_kernel(T *__restrict__ a, const T c, const int n)
Definition math_kernel.h:41
__global__ void addcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void pwmin_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void pwmax_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void col2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void col3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void sub2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void cmult2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void pwmin_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void sub3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsum_kernel(const T *a, T *buf_h, const int n)
__global__ void masked_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
__global__ void add2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void vdot3_kernel(T *__restrict__ dot, const T *__restrict__ u1, const T *__restrict__ u2, const T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const int n)
__global__ void invcol1_kernel(T *__restrict__ a, const int n)
__global__ void add3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void add4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cfill_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_scatter_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:95
__global__ void vcross_kernel(T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const T *__restrict__ w1, const T *__restrict__ w2, const T *__restrict__ w3, const int n)
__global__ void addsqr2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void cadd2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void absval_kernel(T *__restrict__ a, const int n)
__global__ void invcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void subcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void cadd_kernel(T *__restrict__ a, const T c, const int n)
real * bufred
Definition math.cu:614
Object for handling masks in Neko.
Definition mask.f90:34
real * buf
Definition pipecg_aux.cu:42
#define max(a, b)
Definition tensor.cu:40