math_kernel.h
#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__
/*
 Copyright (c) 2021-2023, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

/** Device kernel for cmult: a = c * a */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
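
/*
 * Illustrative usage (not part of the original header): every kernel in this
 * file uses the same grid-stride loop, so a host-side launch is independent
 * of n. A minimal sketch, assuming a device pointer d_a of length n and a
 * hypothetical stream handle `stream`:
 *
 *   const dim3 nthrds(1024, 1, 1);
 *   const dim3 nblcks((n + 1024 - 1) / 1024, 1, 1);
 *   cmult_kernel<double><<<nblcks, nthrds, 0, stream>>>(d_a, c, n);
 */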

/** Device kernel for a masked gather copy: a(i) = b(mask(i)) */
template< typename T >
__global__ void masked_gather_copy_kernel(T * __restrict__ a,
                                          T * __restrict__ b,
                                          int * __restrict__ mask,
                                          const int n,
                                          const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}
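
/*
 * Note on the mask layout (inferred from the indexing above, not documented
 * in this header): mask appears to hold 1-based Fortran indices with the
 * entry count stored at mask[0], hence the mask[i+1]-1 accesses. For
 * example, mask = {3, 1, 5, 9} gathers b[0], b[4], b[8] into a[0..2].
 */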

/** Device kernel for a masked scatter copy: a(mask(i)) = b(i) */
template< typename T >
__global__ void masked_scatter_copy_kernel(T * __restrict__ a,
                                           T * __restrict__ b,
                                           int * __restrict__ mask,
                                           const int n,
                                           const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[i];
  }
}

/** Device kernel for a masked atomic reduction: a(mask(i)) += b(i) */
template< typename T >
__global__ void masked_atomic_reduction_kernel(T * __restrict__ a,
                                               T * __restrict__ b,
                                               int * __restrict__ mask,
                                               const int n,
                                               const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    unsafeAtomicAdd( &(a[mask[i+1]-1]), b[i]);
    //atomicAdd( &(a[mask[i+1]-1]), b[i]);
  }
}
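
/*
 * unsafeAtomicAdd is HIP's fast floating-point atomic, mapping to hardware
 * atomics on AMD GPUs; the commented-out atomicAdd is the portable
 * alternative. Atomics are required here because several mask entries may
 * reference the same element of a.
 */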

/** Device kernel for a masked copy: a(mask(i)) = b(mask(i)) */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}

/** Device kernel for cfill_mask: a(mask(i)) = c */
template< typename T >
__global__ void cfill_mask_kernel(T* __restrict__ a,
                                  const T c,
                                  const int n,
                                  int* __restrict__ mask,
                                  const int n_mask) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n_mask; i += str) { a[mask[i]] = c; }
}

/** Device kernel for cmult2: a = c * b */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/** Device kernel for cdiv: a = c / a */
template< typename T >
__global__ void cdiv_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / a[i];
  }
}

/** Device kernel for cdiv2: a = c / b */
template< typename T >
__global__ void cdiv2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c / b[i];
  }
}

/** Device kernel for cadd: a = a + c */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/** Device kernel for cadd2: a = b + c */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}

/** Device kernel for cfill: a = c */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/** Device kernel for add2: a = a + b */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/** Device kernel for add3: a = b + c */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}

/** Device kernel for add4: a = b + c + d */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}

/** Device kernel for add2s1: a = c1 * a + b */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/** Device kernel for add2s2_many: x = x + sum_j alpha(j) * p(j) */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i]*alpha[j];
    }
    x[i] += tmp;
  }
}
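
/*
 * Usage sketch (illustrative): add2s2_many fuses p_cur AXPY updates
 * x += alpha(j) * p(j) into a single pass over x, a pattern used when
 * several Krylov search directions are applied at once. Assuming d_p is a
 * device array of p_cur device pointers and d_alpha holds the scalars:
 *
 *   add2s2_many_kernel<double><<<nblcks, nthrds>>>(d_x, d_p, d_alpha,
 *                                                  p_cur, n);
 */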

/** Device kernel for add2s2: a = a + c1 * b */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/** Device kernel for addsqr2s2: a = a + c1 * b^2 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/** Device kernel for add3s2: a = c1 * b + c2 * c */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/** Device kernel for invcol1: a = 1 / a */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/** Device kernel for invcol2: a = a / b */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/** Device kernel for invcol3: a = b / c */
template< typename T >
__global__ void invcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] / c[i];
  }
}

/** Device kernel for col2: a = a * b */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/** Device kernel for col3: a = b * c */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/** Device kernel for subcol3: a = a - b * c */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/** Device kernel for sub2: a = a - b */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/** Device kernel for sub3: a = b - c */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/** Device kernel for addcol3: a = a + b * c */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}

/** Device kernel for addcol4: a = a + b * c * d */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}

/** Device kernel for vdot3: dot(i) = u(i) . v(i) */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}

/** Device kernel for vcross: u(i) = v(i) x w(i) */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }
}

/** Warp-level sum reduction using shuffle intrinsics */
template< typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down(val, 32);
  val += __shfl_down(val, 16);
  val += __shfl_down(val, 8);
  val += __shfl_down(val, 4);
  val += __shfl_down(val, 2);
  val += __shfl_down(val, 1);
  return val;
}
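
/*
 * reduce_warp sums val across a warp: after __shfl_down(val, 32) each lane
 * holds val[lane] + val[lane + 32], and halving the offset down to 1 leaves
 * the full warp sum in lane 0. Starting at offset 32 assumes a 64-wide
 * wavefront (AMD); on 32-wide warps the first step reads out of range and
 * leaves val unchanged.
 */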

/** Device kernel for a block-wise sum reduction of bufred */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[64];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
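
/*
 * Sketch of the two-stage reduction these kernels implement (illustrative):
 * a first kernel, e.g. glsum_kernel below, leaves one partial sum per block
 * in a buffer, and a single-block launch of reduce_kernel then collapses
 * those partial sums into bufred[0]:
 *
 *   glsum_kernel<double><<<nblcks, nthrds>>>(d_a, d_bufred, n);
 *   reduce_kernel<double><<<1, 1024>>>(d_bufred, nblcks.x);
 *   // d_bufred[0] now holds the device-local sum
 */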

/** Device kernel for the second reduction stage of glsc3_many */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j
                                    ) {
  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  int i = 512;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i>>1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}

/** Device kernel for glsc3: partial sums of a .* b .* c, one per block */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
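
/*
 * Illustrative host-side sketch for the weighted inner product
 * sum(a * b * c): glsc3_kernel writes one partial sum per block into buf_h,
 * which still needs a final reduction (e.g. with reduce_kernel above)
 * before the scalar is copied back to the host:
 *
 *   glsc3_kernel<double><<<nblcks, nthrds>>>(d_a, d_b, d_c, d_buf, n);
 *   reduce_kernel<double><<<1, 1024>>>(d_buf, nblcks.x);
 */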

/** Device kernel for glsc3_many: j simultaneous glsc3 reductions */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x+y] = tmp;
  __syncthreads();

  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x +y] += buf[(threadIdx.y + i)*blockDim.x+y];
    }
    __syncthreads();
    i = i>>1;
  }
  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x+y] = buf[y];
    }
  }
}

/** Device kernel for glsc2: partial sums of a .* b, one per block */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

/** Device kernel for glsubnorm2: partial sums of (a - b)^2, one per block */
template< typename T >
__global__ void glsubnorm2_kernel(const T * a,
                                  const T * b,
                                  T * buf_h,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += pow(a[i] - b[i], 2.0);
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

/** Device kernel for glsum: partial sums of a, one per block */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
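
/*
 * Naming note: the gl* ("global") kernels compute only the device-local
 * part of the corresponding global reductions; in Neko the partial results
 * are typically finished on the host and combined across MPI ranks with an
 * allreduce.
 */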

/** Device kernel for absval: a = |a| */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}

// ========================================================================== //
// Kernels for the point-wise operations

/** Device kernel for the point-wise max of two vectors: a = max(a, b) */
template <typename T>
__global__ void
  pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], b[i]);
}

/** Device kernel for the point-wise max of two vectors: a = max(b, c) */
template <typename T>
__global__ void pwmax_vec3_kernel(
  T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
}

/** Device kernel for the point-wise max of vector and scalar: a = max(a, c) */
template <typename T>
__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], c);
}

/** Device kernel for the point-wise max of vector and scalar: a = max(b, c) */
template <typename T>
__global__ void pwmax_sca3_kernel(
  T* __restrict__ a, const T* __restrict b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
}

/** Device kernel for the point-wise min of two vectors: a = min(a, b) */
template <typename T>
__global__ void
  pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], b[i]);
}

/** Device kernel for the point-wise min of two vectors: a = min(b, c) */
template <typename T>
__global__ void pwmin_vec3_kernel(
  T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
}

/** Device kernel for the point-wise min of vector and scalar: a = min(a, c) */
template <typename T>
__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], c);
}

/** Device kernel for the point-wise min of vector and scalar: a = min(b, c) */
template <typename T>
__global__ void pwmin_sca3_kernel(
  T* __restrict__ a, const T* __restrict b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
}

#endif // __MATH_MATH_KERNEL_H__