Neko 1.99.1
A portable framework for high-order spectral element flow simulations
math_kernel.h
#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__
/*
 Copyright (c) 2021-2025, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

/**
 * Device kernel for cmult (a = c * a)
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}

/**
 * Device kernel for a masked gather copy (a(i) = b(mask(i))).
 * The mask holds 1-based (Fortran) indices starting at entry 1,
 * hence the mask[i+1]-1 indexing.
 */
template< typename T >
__global__ void masked_gather_copy_kernel(T * __restrict__ a,
                                          T * __restrict__ b,
                                          int * __restrict__ mask,
                                          const int n,
                                          const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}

/**
 * Device kernel for a masked scatter copy (a(mask(i)) = b(i))
 */
template< typename T >
__global__ void masked_scatter_copy_kernel(T * __restrict__ a,
                                           T * __restrict__ b,
                                           int * __restrict__ mask,
                                           const int n,
                                           const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[i];
  }
}

/**
 * Device kernel for a masked atomic reduction (a(mask(i)) += b(i))
 */
template< typename T >
__global__ void masked_atomic_reduction_kernel(T * __restrict__ a,
                                               T * __restrict__ b,
                                               int * __restrict__ mask,
                                               const int n,
                                               const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
#if __CUDA_ARCH__ >= 600
    /* atomicAdd on double requires compute capability 6.0 or newer */
    atomicAdd( &(a[mask[i+1]-1]), b[i]);
#endif
  }
}

/**
 * Device kernel for a masked copy (a(mask(i)) = b(mask(i)))
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}

/**
 * Device kernel for cfill_mask (a(mask(i)) = c)
 */
template <typename T>
__global__ void cfill_mask_kernel(T* __restrict__ a,
                                  const T c,
                                  const int size,
                                  int* __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < mask_size; i += str) { a[mask[i]] = c; }
}

/**
 * Device kernel for cmult2 (a = c * b)
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/**
 * Device kernel for cdiv (a = c / a)
 */
template< typename T >
__global__ void cdiv_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / a[i];
  }
}

/**
 * Device kernel for cdiv2 (a = c / b)
 */
template< typename T >
__global__ void cdiv2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / b[i];
  }
}

/**
 * Device kernel for cadd (a = a + c)
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/**
 * Device kernel for cadd2 (a = b + c)
 */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}

/**
 * Device kernel for cfill (a = c)
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/**
 * Device kernel for add2 (a = a + b)
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/**
 * Device kernel for add3 (a = b + c)
 */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}

/**
 * Device kernel for add4 (a = b + c + d)
 */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}

/**
 * Device kernel for add2s1 (a = c1 * a + b), scaling the first argument
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/**
 * Device kernel for add2s2 against many vectors
 * (x = x + sum_j alpha(j) * p_j)
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i+= str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i]*alpha[j];
    }
    x[i] += tmp;
  }
}

/**
 * Device kernel for add2s2 (a = a + c1 * b), scaling the second argument
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/**
 * Device kernel for addsqr2s2 (a = a + c1 * b^2)
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/**
 * Device kernel for add3s2 (a = c1 * b + c2 * c)
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/**
 * Device kernel for add4s3 (a = c1 * b + c2 * c + c3 * d)
 */
template< typename T >
__global__ void add4s3_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T c1,
                              const T c2,
                              const T c3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i] + c3 * d[i];
  }
}

/**
 * Device kernel for add5s4 (a = a + c1 * b + c2 * c + c3 * d + c4 * e)
 */
template< typename T >
__global__ void add5s4_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T * __restrict__ e,
                              const T c1,
                              const T c2,
                              const T c3,
                              const T c4,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i] + c2 * c[i] + c3 * d[i] + c4 * e[i];
  }
}

/**
 * Device kernel for invcol1 (a = 1 / a)
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/**
 * Device kernel for invcol2 (a = a / b)
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/**
 * Device kernel for invcol3 (a = b / c)
 */
template< typename T >
__global__ void invcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] / c[i];
  }
}

/**
 * Device kernel for col2 (a = a * b)
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/**
 * Device kernel for col3 (a = b * c)
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/**
 * Device kernel for subcol3 (a = a - b * c)
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/**
 * Device kernel for sub2 (a = a - b)
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/**
 * Device kernel for sub3 (a = b - c)
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/**
 * Device kernel for addcol3 (a = a + b * c)
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}

/**
 * Device kernel for addcol4 (a = a + b * c * d)
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}

/**
 * Device kernel for addcol3s2 (a = a + s * b * c)
 */
template< typename T >
__global__ void addcol3s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T * __restrict__ c,
                                 const T s,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + s * b[i] * c[i];
  }
}

/**
 * Device kernel for vdot3, the point-wise dot product of two 3d vector
 * fields (dot = u . v)
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}

/**
 * Device kernel for vcross, the point-wise cross product of two 3d
 * vector fields (u = v x w)
 */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }
}

/**
 * Warp-level sum reduction using shuffle instructions
 */
template< typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}

/**
 * Reduction kernel, folding the per-block partial sums in bufred
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

/**
 * Reduction kernel for glsc3_many, folding each of the j columns of
 * partial results in bufred into bufred[column]
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j) {

  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  int i = 512;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i>>1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}

/**
 * Device kernel for glsc3, the triple inner product sum(a * b * c),
 * leaving one partial result per block in buf_h
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for glsc3 against j vectors b_j at once, leaving j
 * partial results per block in buf_h
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i+= str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x+y] = tmp;
  __syncthreads();

  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x +y] += buf[(threadIdx.y + i)*blockDim.x+y];
    }
    __syncthreads();
    i = i>>1;
  }

  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x+y] = buf[y];
    }
  }
}

/**
 * Device kernel for glsc2, the inner product sum(a * b), leaving one
 * partial result per block in buf_h
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for glsubnorm2, the squared norm of the difference
 * sum((a - b)^2), leaving one partial result per block in buf_h
 */
template< typename T >
__global__ void glsubnorm2_kernel(const T * a,
                                  const T * b,
                                  T * buf_h,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += pow(a[i] - b[i], 2.0);
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for glsum, the sum of all entries of a, leaving one
 * partial result per block in buf_h
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for absval (a = |a|)
 */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}

// ========================================================================== //
// Kernels for the point-wise operations

/**
 * Device kernel for the point-wise max of two vectors (a = max(a, b))
 */
template <typename T>
__global__ void
    pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], b[i]);
}

/**
 * Device kernel for the point-wise max of two vectors (a = max(b, c))
 */
template <typename T>
__global__ void pwmax_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
}

/**
 * Device kernel for the point-wise max of a vector and a scalar
 * (a = max(a, c))
 */
template <typename T>
__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], c);
}

/**
 * Device kernel for the point-wise max of a vector and a scalar
 * (a = max(b, c))
 */
template <typename T>
__global__ void pwmax_sca3_kernel(
    T* __restrict__ a, const T* __restrict b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
}

/**
 * Device kernel for the point-wise min of two vectors (a = min(a, b))
 */
template <typename T>
__global__ void
    pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], b[i]);
}

/**
 * Device kernel for the point-wise min of two vectors (a = min(b, c))
 */
template <typename T>
__global__ void pwmin_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
}

/**
 * Device kernel for the point-wise min of a vector and a scalar
 * (a = min(a, c))
 */
template <typename T>
__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], c);
}

/**
 * Device kernel for the point-wise min of a vector and a scalar
 * (a = min(b, c))
 */
template <typename T>
__global__ void pwmin_sca3_kernel(
    T* __restrict__ a, const T* __restrict b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
}

#endif // __MATH_MATH_KERNEL_H__
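
These kernels are launched from Neko's host-side wrappers (see math.cu) rather than called directly. As a minimal illustration of the grid-stride convention they all share, the stand-alone driver below fills a device array and scales it in place. The 1024-thread block size and the grid dimensioning are illustrative assumptions, not Neko's actual launch configuration.

#include <cstdio>
#include <cuda_runtime.h>
#include "math_kernel.h"

int main() {
  const int n = 1 << 20;
  double *a = nullptr;
  cudaMalloc(&a, n * sizeof(double));

  /* Assumed launch configuration: grid-stride loops accept any grid
     size, so the block count need not cover all n elements. */
  const dim3 nthrds(1024);
  const dim3 nblcks((n + 1024 - 1) / 1024);

  cfill_kernel<double><<<nblcks, nthrds>>>(a, 1.0, n); /* a_i = 1       */
  cmult_kernel<double><<<nblcks, nthrds>>>(a, 2.5, n); /* a_i = 2.5*a_i */

  double a0;
  cudaMemcpy(&a0, a, sizeof(double), cudaMemcpyDeviceToHost);
  printf("a[0] = %f\n", a0); /* expect 2.5 */

  cudaFree(a);
  return 0;
}

Because every point-wise kernel in this header uses the same idx/str grid-stride loop, the same configuration works for any of them.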
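The gl* kernels (glsum, glsc2, glsc3, glsubnorm2) only produce one partial result per block, so a second pass is needed to collapse the buffer to a single value. Below is a minimal sketch of that two-pass pattern with a hypothetical device_glsum helper; the block and thread counts are illustrative, and Neko's wrappers in math.cu size and drive these launches differently.

#include <cuda_runtime.h>
#include "math_kernel.h"

/* Hypothetical helper: sum the n entries of device array a.
 * bufred is device scratch space holding at least nb elements. */
double device_glsum(const double *a, double *bufred, const int n) {
  const int nt = 1024; /* threads per block; a multiple of warpSize,
                        * since reduce_warp assumes full warps */
  const int nb = 128;  /* number of per-block partial sums (assumed) */

  /* Pass 1: block b writes its partial sum to bufred[b]. */
  glsum_kernel<double><<<nb, nt>>>(a, bufred, n);
  /* Pass 2: a single block folds the nb partial sums into bufred[0]. */
  reduce_kernel<double><<<1, nt>>>(bufred, nb);

  double sum;
  cudaMemcpy(&sum, bufred, sizeof(double), cudaMemcpyDeviceToHost);
  return sum;
}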