Neko 1.99.1
A portable framework for high-order spectral element flow simulations
math_kernel.h
#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__
/*
 Copyright (c) 2021-2023, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

/** Device kernel for cmult: a = c * a */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}

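/*
 * Usage sketch (editorial, not part of the original header): every kernel
 * in this file uses the same grid-stride loop, so a host-side launch of
 * cmult_kernel might look as follows. The device pointer a_d, the scalar
 * c and the stream are assumed to be set up by the caller.
 *
 *   const dim3 nthrds(1024, 1, 1);
 *   const dim3 nblcks((n + 1024 - 1) / 1024, 1, 1);
 *   cmult_kernel<double><<<nblcks, nthrds, 0, stream>>>(a_d, c, n);
 */
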
/** Device kernel for a masked gather copy: a(i) = b(mask(i)) */
template< typename T >
__global__ void masked_gather_copy_kernel(T * __restrict__ a,
                                          T * __restrict__ b,
                                          int * __restrict__ mask,
                                          const int n,
                                          const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}

/** Device kernel for a masked scatter copy: a(mask(i)) = b(i) */
template< typename T >
__global__ void masked_scatter_copy_kernel(T * __restrict__ a,
                                           T * __restrict__ b,
                                           int * __restrict__ mask,
                                           const int n,
                                           const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[i];
  }
}

/** Device kernel for a masked atomic reduction: a(mask(i)) += b(i) */
template< typename T >
__global__ void masked_atomic_reduction_kernel(T * __restrict__ a,
                                               T * __restrict__ b,
                                               int * __restrict__ mask,
                                               const int n,
                                               const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    /* Atomic form of a[mask[i+1]-1] += b[i], safe when several mask
       entries alias the same location */
    unsafeAtomicAdd(&(a[mask[i+1]-1]), b[i]);
    /* atomicAdd(&(a[mask[i+1]-1]), b[i]); */
  }
}

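/*
 * Note (editorial): unsafeAtomicAdd is a HIP intrinsic that lets the
 * compiler emit hardware floating-point atomics on AMD GPUs; the
 * commented-out atomicAdd line is the portable alternative retained in
 * the original source.
 */
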
/** Device kernel for a masked copy: a(mask(i)) = b(mask(i)) */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}

/** Device kernel for cfill_mask: a(mask(i)) = c */
template< typename T >
__global__ void cfill_mask_kernel(T * __restrict__ a,
                                  const T c,
                                  const int n,
                                  int * __restrict__ mask,
                                  const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) { a[mask[i]] = c; }
}

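/*
 * Note (editorial): the masked kernels above index the mask as
 * mask[i+1]-1, converting Fortran-style 1-based indices and skipping
 * element 0 (which presumably carries the mask length on the Fortran
 * side), whereas cfill_mask_kernel uses mask[i] directly as a 0-based
 * index.
 */
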
/** Device kernel for cmult2: a = c * b */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/** Device kernel for cdiv: a = c / a */
template< typename T >
__global__ void cdiv_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / a[i];
  }
}

/** Device kernel for cdiv2: a = c / b */
template< typename T >
__global__ void cdiv2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / b[i];
  }
}

/** Device kernel for cadd: a = a + c */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/** Device kernel for cadd2: a = b + c */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}

/** Device kernel for cfill: a = c */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/** Device kernel for add2: a = a + b */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/** Device kernel for add3: a = b + c */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}

/** Device kernel for add4: a = b + c + d */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}

/** Device kernel for add2s1: a = c1 * a + b */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/** Device kernel for add2s2 over many vectors: x = x + sum_j alpha_j * p_j */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i+= str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i]*alpha[j];
    }
    x[i] += tmp;
  }
}

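/*
 * Usage sketch (editorial): add2s2_many_kernel fuses several AXPY-like
 * updates, reading each x[i] only once. The host is assumed to pass a
 * device-resident array of device pointers:
 *
 *   // p_d: device array of p_cur pointers to length-n vectors
 *   add2s2_many_kernel<double><<<nblcks, nthrds, 0, stream>>>
 *     (x_d, (const double **) p_d, alpha_d, p_cur, n);
 */
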
/** Device kernel for add2s2: a = a + c1 * b */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/** Device kernel for addsqr2s2: a = a + c1 * b^2 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/** Device kernel for add3s2: a = c1 * b + c2 * c */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/** Device kernel for add4s3: a = c1 * b + c2 * c + c3 * d */
template< typename T >
__global__ void add4s3_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T c1,
                              const T c2,
                              const T c3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i] + c3 * d[i];
  }
}

/** Device kernel for add5s4: a = a + c1 * b + c2 * c + c3 * d + c4 * e */
template< typename T >
__global__ void add5s4_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T * __restrict__ e,
                              const T c1,
                              const T c2,
                              const T c3,
                              const T c4,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i] + c2 * c[i] + c3 * d[i] + c4 * e[i];
  }
}

/** Device kernel for invcol1: a = 1 / a */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/** Device kernel for invcol2: a = a / b */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/** Device kernel for invcol3: a = b / c */
template< typename T >
__global__ void invcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] / c[i];
  }
}

/** Device kernel for col2: a = a * b */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/** Device kernel for col3: a = b * c */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/** Device kernel for subcol3: a = a - b * c */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/** Device kernel for sub2: a = a - b */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/** Device kernel for sub3: a = b - c */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/** Device kernel for addcol3: a = a + b * c */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }

}

/** Device kernel for addcol4: a = a + b * c * d */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }

}

/** Device kernel for addcol3s2: a = a + s * b * c */
template< typename T >
__global__ void addcol3s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T * __restrict__ c,
                                 const T s,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + s * b[i] * c[i];
  }

}

/** Device kernel for vdot3: dot = u . v, component-wise over three fields */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }

}

/** Device kernel for vcross: u = v x w, component-wise over three fields */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }
}

/** Warp-level sum reduction using shuffle operations */
template< typename T >
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down(val, 32);
  val += __shfl_down(val, 16);
  val += __shfl_down(val, 8);
  val += __shfl_down(val, 4);
  val += __shfl_down(val, 2);
  val += __shfl_down(val, 1);
  return val;
}

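/*
 * Note (editorial): the shuffle offsets 32..1 sum over 64 lanes, matching
 * the 64-wide wavefronts of the AMD GPUs this HIP backend targets; a
 * 32-wide CUDA warp would start the sequence at offset 16 and use the
 * __shfl_down_sync intrinsic instead of the plain __shfl_down used here.
 */
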
/** Reduction kernel: each block reduces its grid-stride partial sums of
    bufred into bufred[blockIdx.x] */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i<n ; i += str)
  {
    sum += bufred[i];
  }

  __shared__ T shared[64];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

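/*
 * Usage sketch (editorial): global reductions take two launches, one that
 * produces a partial sum per block and one reduce_kernel launch with a
 * single block that collapses those partial sums; buffer names below are
 * assumptions.
 *
 *   glsum_kernel<double><<<nblcks, nthrds, 0, stream>>>(a_d, bufred_d, n);
 *   reduce_kernel<double><<<1, nthrds, 0, stream>>>(bufred_d, nblcks.x);
 *   // bufred_d[0] now holds sum_i a[i]
 */
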
/** Per-column reduction kernel used by glsc3_many: reduces column y of
    the j interleaved partial-sum buffers */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j
                                    ) {
  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step)
  {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  int i = 512;
  while (i != 0)
  {
    if (threadIdx.x < i && (threadIdx.x + i) < n)
    {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i>>1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}

/** Device kernel for the weighted inner product glsc3: one partial sum of
    a * b * c per block */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

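/*
 * Usage sketch (editorial): glsc3_kernel leaves one partial sum of
 * a * b * c per block in its buffer; a reduce_kernel launch as above and
 * a copy of element 0 back to the host complete the inner product. Names
 * are assumptions.
 *
 *   glsc3_kernel<double><<<nblcks, nthrds, 0, stream>>>(a_d, b_d, c_d,
 *                                                       buf_d, n);
 *   reduce_kernel<double><<<1, nthrds, 0, stream>>>(buf_d, nblcks.x);
 */
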
/** Device kernel for glsc3 applied to several vectors b at once */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i+= str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x+y] = tmp;
  __syncthreads();

  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x +y] += buf[(threadIdx.y + i)*blockDim.x+y];
    }
    __syncthreads();
    i = i>>1;
  }
  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x+y] = buf[y];
    }
  }
}

/** Device kernel for the inner product glsc2: one partial sum of a * b
    per block */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

/** Device kernel for the squared norm of a difference: one partial sum of
    (a - b)^2 per block */
template< typename T >
__global__ void glsubnorm2_kernel(const T * a,
                                  const T * b,
                                  T * buf_h,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += pow(a[i] - b[i], 2.0);
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

/** Device kernel for glsum: one partial sum of a per block */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i<n ; i += str)
  {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

/** Device kernel for absval: a = |a| */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}

// ========================================================================== //
// Kernels for the point-wise operations

template <typename T>
__global__ void
  pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], b[i]);
}

/** Device kernel for the point-wise maximum of two vectors: a = max(b, c) */
template <typename T>
__global__ void pwmax_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
}

template <typename T>
__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], c);
}

/** Device kernel for the point-wise maximum with a scalar: a = max(b, c) */
template <typename T>
__global__ void pwmax_sca3_kernel(
    T* __restrict__ a, const T* __restrict b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
}

template <typename T>
__global__ void
  pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], b[i]);
}

/** Device kernel for the point-wise minimum of two vectors: a = min(b, c) */
template <typename T>
__global__ void pwmin_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
}

template <typename T>
__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], c);
}

/** Device kernel for the point-wise minimum with a scalar: a = min(b, c) */
template <typename T>
__global__ void pwmin_sca3_kernel(
    T* __restrict__ a, const T* __restrict b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
}

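/*
 * Usage sketch (editorial): the point-wise kernels share the grid-stride
 * launch pattern; clamping a field from below by zero, for instance,
 * could use pwmax_sca2_kernel (names assumed):
 *
 *   pwmax_sca2_kernel<double><<<nblcks, nthrds, 0, stream>>>(a_d, 0.0, n);
 */
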
#endif // __MATH_MATH_KERNEL_H__