Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math_kernel.h
Go to the documentation of this file.
1#ifndef __MATH_MATH_KERNEL_H__
2#define __MATH_MATH_KERNEL_H__
3/*
4 Copyright (c) 2021-2025, The Neko Authors
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following
16 disclaimer in the documentation and/or other materials provided
17 with the distribution.
18
19 * Neither the name of the authors nor the names of its
20 contributors may be used to endorse or promote products derived
21 from this software without specific prior written permission.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35*/
36
/**
 * Device kernel for constant multiplication @f$ a_i = c \cdot a_i @f$.
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  // Grid-stride loop: correct for any launch configuration
  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
52
/**
 * Device kernel for a masked gather copy @f$ a_i = b_{mask_i} @f$.
 * The mask stores 1-based (Fortran) indices starting at mask[1],
 * hence the `mask[i+1] - 1` conversion to 0-based C indexing.
 */
template< typename T >
__global__ void masked_gather_copy_kernel(T * __restrict__ a,
                                          T * __restrict__ b,
                                          int * __restrict__ mask,
                                          const int n,
                                          const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}
70
71
/**
 * Device kernel for a masked gather copy @f$ a_i = b_{mask_i} @f$
 * with an already 0-based ("aligned") mask: mask[i] is used directly,
 * with no 1-based offset conversion.
 */
template< typename T >
__global__ void masked_gather_copy_aligned_kernel(T * __restrict__ a,
                                                  T * __restrict__ b,
                                                  int * __restrict__ mask,
                                                  const int n,
                                                  const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[i] = b[mask[i]];
  }
}
89
90
/**
 * Device kernel for a masked scatter copy @f$ a_{mask_i} = b_i @f$.
 * The mask stores 1-based indices starting at mask[1], hence the
 * `mask[i+1] - 1` conversion to 0-based C indexing.
 */
template< typename T >
__global__ void masked_scatter_copy_kernel(T * __restrict__ a,
                                           T * __restrict__ b,
                                           int * __restrict__ mask,
                                           const int n,
                                           const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[i];
  }
}
108
109
/**
 * Device kernel for a masked atomic reduction
 * @f$ a_{mask_i} \mathrel{+}= b_i @f$, where several mask entries may
 * point at the same destination (hence the atomics).
 * The mask stores 1-based indices starting at mask[1].
 *
 * NOTE: atomicAdd on double requires SM60+; on older architectures the
 * guarded body compiles away and this kernel is silently a no-op.
 */
template< typename T >
__global__ void masked_atomic_reduction_kernel(T * __restrict__ a,
                                               T * __restrict__ b,
                                               int * __restrict__ mask,
                                               const int n,
                                               const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
#if __CUDA_ARCH__ >= 600
    atomicAdd( &(a[mask[i+1]-1]), b[i]);
#endif
  }
}
129
/**
 * Device kernel for a masked in-place copy
 * @f$ a_{mask_i} = b_{mask_i} @f$: only the masked entries of b are
 * copied into the same positions of a.
 * The mask stores 1-based indices starting at mask[1].
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}
147
/**
 * Device kernel filling masked entries with a constant:
 * @f$ a_{mask_i} = c @f$. Here the mask holds 0-based indices
 * (mask[i] is used directly).
 */
template <typename T>
__global__ void cfill_mask_kernel(T* __restrict__ a,
                                  const T c,
                                  const int size,
                                  int* __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < mask_size; i += str) { a[mask[i]] = c; }
}
163
/**
 * Device kernel for constant multiplication into a second array:
 * @f$ a_i = c \cdot b_i @f$.
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}
180
/**
 * Device kernel for constant division @f$ a_i = c / a_i @f$.
 */
template< typename T >
__global__ void cdiv_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / a[i];
  }
}
196
/**
 * Device kernel for constant division into a second array:
 * @f$ a_i = c / b_i @f$.
 */
template< typename T >
__global__ void cdiv2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / b[i];
  }
}
213
/**
 * Device kernel for constant addition @f$ a_i = a_i + c @f$.
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}
229
/**
 * Device kernel for constant addition into a second array:
 * @f$ a_i = b_i + c @f$.
 */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}
246
/**
 * Device kernel filling an array with a constant @f$ a_i = c @f$.
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}
262
/**
 * Device kernel for vector addition @f$ a_i = a_i + b_i @f$.
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}
278
/**
 * Device kernel for vector addition @f$ a_i = b_i + c_i @f$.
 */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}
295
/**
 * Device kernel for vector addition @f$ a_i = b_i + c_i + d_i @f$.
 */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}
313
/**
 * Device kernel for the scaled addition @f$ a_i = c_1 a_i + b_i @f$
 * (the scalar multiplies the destination vector).
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}
330
/**
 * Device kernel accumulating several scaled vectors at once:
 * @f$ x_i \mathrel{+}= \sum_{j<p\_cur} \alpha_j \, p_{j,i} @f$.
 * Each thread accumulates its partial sum in a register before the
 * single store to x, avoiding p_cur round trips to global memory.
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;


  for (int i = idx; i < n; i+= str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i]*alpha[j];
    }
    x[i] += tmp;
  }
}
353
/**
 * Device kernel for the AXPY-style update @f$ a_i = a_i + c_1 b_i @f$.
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}
370
/**
 * Device kernel for the squared update @f$ a_i = a_i + c_1 b_i^2 @f$.
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}
387
/**
 * Device kernel for the linear combination
 * @f$ a_i = c_1 b_i + c_2 c_i @f$.
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}
406
/**
 * Device kernel for the linear combination
 * @f$ a_i = c_1 b_i + c_2 c_i + c_3 d_i @f$.
 */
template< typename T >
__global__ void add4s3_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T c1,
                              const T c2,
                              const T c3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i] + c3 * d[i];
  }
}
427
/**
 * Device kernel for the accumulated linear combination
 * @f$ a_i = a_i + c_1 b_i + c_2 c_i + c_3 d_i + c_4 e_i @f$.
 * Note that, unlike add3s2/add4s3, the previous value of a is kept.
 */
template< typename T >
__global__ void add5s4_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T * __restrict__ d,
                              const T * __restrict__ e,
                              const T c1,
                              const T c2,
                              const T c3,
                              const T c4,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i] + c2 * c[i] + c3 * d[i] + c4 * e[i];
  }
}
450
/**
 * Device kernel for the point-wise reciprocal @f$ a_i = 1 / a_i @f$.
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  // Typed constant so the division stays in T's precision
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}
466
/**
 * Device kernel for point-wise division @f$ a_i = a_i / b_i @f$.
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}
482
/**
 * Device kernel for point-wise division @f$ a_i = b_i / c_i @f$.
 */
template< typename T >
__global__ void invcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] / c[i];
  }
}
499
/**
 * Device kernel for the point-wise product @f$ a_i = a_i \cdot b_i @f$.
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}
515
/**
 * Device kernel for the point-wise product @f$ a_i = b_i \cdot c_i @f$.
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}
532
/**
 * Device kernel for the subtracted product
 * @f$ a_i = a_i - b_i \cdot c_i @f$.
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}
549
/**
 * Device kernel for vector subtraction @f$ a_i = a_i - b_i @f$.
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}
565
/**
 * Device kernel for vector subtraction @f$ a_i = b_i - c_i @f$.
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}
582
/**
 * Device kernel for the accumulated product
 * @f$ a_i = a_i + b_i \cdot c_i @f$.
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }

}
600
/**
 * Device kernel for the accumulated triple product
 * @f$ a_i = a_i + b_i \cdot c_i \cdot d_i @f$.
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }

}
619
/**
 * Device kernel for the scaled accumulated product
 * @f$ a_i = a_i + s \, b_i \cdot c_i @f$.
 */
template< typename T >
__global__ void addcol3s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T * __restrict__ c,
                                 const T s,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + s * b[i] * c[i];
  }

}
638
/**
 * Device kernel for the point-wise 3D dot product
 * @f$ dot_i = u_i \cdot v_i @f$ of two vector fields stored as
 * separate component arrays (structure-of-arrays layout).
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }

}
660
/**
 * Device kernel for the point-wise 3D cross product
 * @f$ u_i = v_i \times w_i @f$ of two vector fields stored as
 * separate component arrays (structure-of-arrays layout).
 */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }

}
686
687
/**
 * Warp-level sum reduction using shuffle-down; after the call, lane 0
 * holds the sum over all 32 lanes. Requires all lanes to participate
 * (full 0xffffffff mask) and SM30+ for `__shfl_down_sync`.
 */
template< typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}
700
/**
 * Device kernel summing the per-block partial results in bufred;
 * block b writes its total into bufred[b]. Standard two-stage scheme:
 * warp shuffle reduction, then a shared-memory reduction across the
 * (at most 32) warps of the block. Assumes blockDim.x is a multiple
 * of warpSize.
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i<n ; i += str)
  {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  // Stage 1: reduce within each warp; lane 0 publishes the warp total
  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  // Stage 2: first warp reduces the per-warp totals
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
731
732
/**
 * Second-stage reduction for glsc3_many: bufred holds n partial rows
 * of j interleaved sums (bufred[i*j + y]); block y reduces column y
 * into bufred[y]. Shared-memory tree reduction sized for up to 1024
 * threads per block.
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j
                                    ) {
  __shared__ T buf[1024] ;
  const int idx = threadIdx.x;
  const int y= blockIdx.x;
  const int step = blockDim.x;

  buf[idx]=0;
  for (int i=idx ; i<n ; i+=step)
  {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  // Tree reduction over the shared buffer; the barrier is outside the
  // conditional so every thread of the block reaches it
  int i = 512;
  while (i != 0)
  {
    if(threadIdx.x < i && (threadIdx.x + i) < n )
    {
      buf[threadIdx.x] += buf[threadIdx.x + i] ;
    }
    i = i>>1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}
767
768
/**
 * Device kernel for the weighted inner product
 * @f$ \sum_i a_i b_i c_i @f$; block b writes its partial sum into
 * buf_h[b] (to be reduced further on the host or by reduce_kernel).
 * Assumes blockDim.x is a multiple of warpSize.
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i] * c[i];
  }

  // Warp reduction, then cross-warp reduction via shared memory
  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
803
/**
 * Device kernel computing j weighted inner products
 * @f$ \sum_i a_i b_{k,i} c_i @f$, k = 0..j-1, in one pass.
 * 2D block layout: threadIdx.x selects the vector k (must satisfy
 * blockDim.x >= j), threadIdx.y strides over the data. Partial sums
 * are written interleaved: buf_h[j*blockIdx.x + k]. The shared buffer
 * limits blockDim.x*blockDim.y to 1024.
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if(y < j){
    for (int i = idx; i < n; i+= str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x+y] = tmp;
  __syncthreads();

  // Tree reduction along the y dimension; barrier outside the branch
  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x +y] += buf[(threadIdx.y + i)*blockDim.x+y];
    }
    __syncthreads();
    i = i>>1;
  }
  if (threadIdx.y == 0) {
    if( y < j) {
      buf_h[j*blockIdx.x+y] = buf[y];
    }
  }
}
844
/**
 * Device kernel for the inner product @f$ \sum_i a_i b_i @f$;
 * block b writes its partial sum into buf_h[b].
 * Assumes blockDim.x is a multiple of warpSize.
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i];
  }

  // Warp reduction, then cross-warp reduction via shared memory
  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
879
/**
 * Device kernel for the squared difference norm
 * @f$ \sum_i (a_i - b_i)^2 @f$; block b writes its partial sum into
 * buf_h[b]. Assumes blockDim.x is a multiple of warpSize.
 */
template< typename T >
__global__ void glsubnorm2_kernel(const T * a,
                                  const T * b,
                                  T * buf_h,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    // diff*diff instead of pow(diff, 2.0): avoids promoting the whole
    // computation to double when T is float
    const T diff = a[i] - b[i];
    sum += diff * diff;
  }

  // Warp reduction, then cross-warp reduction via shared memory
  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
914
/**
 * Device kernel for the vector sum @f$ \sum_i a_i @f$;
 * block b writes its partial sum into buf_h[b].
 * Assumes blockDim.x is a multiple of warpSize.
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i<n ; i += str)
  {
    sum += a[i];
  }

  // Warp reduction, then cross-warp reduction via shared memory
  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
949
/**
 * Device kernel for the point-wise absolute value
 * @f$ a_i = |a_i| @f$.
 */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}
964
965// ========================================================================== //
966// Kernels for the point-wise operations
967
/**
 * Device kernel for the point-wise maximum
 * @f$ a_i = \max(a_i, b_i) @f$.
 */
template <typename T>
__global__ void
    pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride) {
    a[i] = max(a[i], b[i]);
  }
}
981
/**
 * Device kernel for the point-wise maximum
 * @f$ a_i = \max(b_i, c_i) @f$.
 */
template <typename T>
__global__ void pwmax_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
}
996
/**
 * Device kernel for the point-wise maximum against a scalar
 * @f$ a_i = \max(a_i, c) @f$.
 */
template <typename T>
__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride) {
    a[i] = max(a[i], c);
  }
}
1009
/**
 * Device kernel for the point-wise maximum against a scalar
 * @f$ a_i = \max(b_i, c) @f$.
 */
template <typename T>
__global__ void pwmax_sca3_kernel(
    T* __restrict__ a, const T* __restrict b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
}
1023
/**
 * Device kernel for the point-wise minimum
 * @f$ a_i = \min(a_i, b_i) @f$.
 */
template <typename T>
__global__ void
    pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride) {
    a[i] = min(a[i], b[i]);
  }
}
1037
/**
 * Device kernel for the point-wise minimum
 * @f$ a_i = \min(b_i, c_i) @f$.
 */
template <typename T>
__global__ void pwmin_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
}
1052
/**
 * Device kernel for the point-wise minimum against a scalar
 * @f$ a_i = \min(a_i, c) @f$.
 */
template <typename T>
__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  for (int i = tid; i < n; i += stride) {
    a[i] = min(a[i], c);
  }
}
1065
/**
 * Device kernel for the point-wise minimum against a scalar
 * @f$ a_i = \min(b_i, c) @f$.
 */
template <typename T>
__global__ void pwmin_sca3_kernel(
    T* __restrict__ a, const T* __restrict b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
}
1079
1080#endif // __MATH_MATH_KERNEL_H__
const int i
const int e
const int j
__syncthreads()
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void addcol4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void pwmin_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void reduce_kernel(T *bufred, const int n)
__global__ void cdiv2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void invcol2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add4s3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T c1, const T c2, const T c3, const int n)
__inline__ __device__ T reduce_warp(T val)
__global__ void masked_atomic_reduction_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
__global__ void pwmax_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h, const int j, const int n)
__global__ void addcol3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T s, const int n)
__global__ void cfill_mask_kernel(T *__restrict__ a, const T c, const int size, int *__restrict__ mask, const int mask_size)
__global__ void cdiv_kernel(T *__restrict__ a, const T c, const int n)
__global__ void glsc3_reduce_kernel(T *bufred, const int n, const int j)
__global__ void masked_gather_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:76
__global__ void pwmax_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void glsubnorm2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void pwmin_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T c1, const T c2, const int n)
__global__ void add2s1_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void add5s4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T *__restrict__ e, const T c1, const T c2, const T c3, const T c4, const int n)
__global__ void masked_gather_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:57
__global__ void add2s2_many_kernel(T *__restrict__ x, const T **p, const T *alpha, const int p_cur, const int n)
__global__ void pwmax_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void cmult_kernel(T *__restrict__ a, const T c, const int n)
Definition math_kernel.h:41
__global__ void addcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void pwmin_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void pwmax_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void col2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void col3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void sub2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void cmult2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void pwmin_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void sub3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsum_kernel(const T *a, T *buf_h, const int n)
__global__ void masked_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
__global__ void add2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void vdot3_kernel(T *__restrict__ dot, const T *__restrict__ u1, const T *__restrict__ u2, const T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const int n)
__global__ void invcol1_kernel(T *__restrict__ a, const int n)
__global__ void add3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void add4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cfill_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_scatter_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
Definition math_kernel.h:95
__global__ void vcross_kernel(T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const T *__restrict__ w1, const T *__restrict__ w2, const T *__restrict__ w3, const int n)
__global__ void addsqr2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void cadd2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void absval_kernel(T *__restrict__ a, const int n)
__global__ void invcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void subcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void cadd_kernel(T *__restrict__ a, const T c, const int n)
real * bufred
Definition math.cu:614
Object for handling masks in Neko.
Definition mask.f90:34
real * buf
Definition pipecg_aux.cu:42
#define max(a, b)
Definition tensor.cu:40