Neko 0.9.99
A portable framework for high-order spectral element flow simulations
math_kernel.h
#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__
/*
 Copyright (c) 2021-2023, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

/**
 * Device kernel for cmult (a = c * a)
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
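
/*
 * Launch-pattern note (editorial, not part of the original header): all of
 * the kernels in this file use a grid-stride loop, so they are correct for
 * any grid size. A minimal host-side sketch, assuming d_a is a device array
 * of doubles allocated elsewhere:
 *
 *   const int n = 1 << 20;
 *   const dim3 nthrds(1024, 1, 1);
 *   const dim3 nblcks((n + 1024 - 1) / 1024, 1, 1);
 *   cmult_kernel<double><<<nblcks, nthrds>>>(d_a, 2.0, n);   // a = 2 * a
 *   cudaDeviceSynchronize();
 */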

/**
 * Device kernel for masked_red_copy: gather the masked entries of b
 * into the contiguous array a
 */
template< typename T >
__global__ void masked_red_copy_kernel(T * __restrict__ a,
                                       T * __restrict__ b,
                                       int * __restrict__ mask,
                                       const int n,
                                       const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}

/**
 * Device kernel for masked_copy: copy the masked entries of b into
 * the same entries of a
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}
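
/*
 * Mask layout note (inferred from the indexing above): the mask arrays for
 * masked_copy_kernel and masked_red_copy_kernel appear to come from Fortran,
 * so mask[i+1]-1 skips a leading length entry and converts 1-based indices
 * to 0-based. cfill_mask_kernel below instead reads mask[i]-1, i.e. a plain
 * array of 1-based indices without the length prefix.
 */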

/**
 * Device kernel for cfill_mask: set the masked entries of a to the
 * constant c
 */
template <typename T>
__global__ void cfill_mask_kernel(T * __restrict__ a,
                                  const T c,
                                  const int size,
                                  int * __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < mask_size; i += str) {
    a[mask[i]-1] = c;
  }
}

/**
 * Device kernel for cmult2 (a = c * b)
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/**
 * Device kernel for cadd (a = a + c)
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/**
 * Device kernel for cadd2 (a = b + c)
 */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}

/**
 * Device kernel for cfill (a = c)
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/**
 * Device kernel for add2 (a = a + b)
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/**
 * Device kernel for add3 (a = b + c)
 */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}

/**
 * Device kernel for add4 (a = b + c + d)
 */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}

/**
 * Device kernel for add2s1 (a = c1 * a + b)
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/**
 * Device kernel for add2s2_many (x = x + sum_j alpha[j] * p[j])
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j++) {
      tmp += p[j][i] * alpha[j];
    }
    x[i] += tmp;
  }
}
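
/*
 * Usage note (illustrative, hypothetical names): p is a device array of
 * device pointers, so a host caller must stage the pointer table on the
 * device before launching, e.g.
 *
 *   const double *h_p[3] = {d_p0, d_p1, d_p2};  // device vectors
 *   const double **d_p;
 *   cudaMalloc((void **) &d_p, 3 * sizeof(double *));
 *   cudaMemcpy(d_p, h_p, 3 * sizeof(double *), cudaMemcpyHostToDevice);
 *   add2s2_many_kernel<double><<<nblcks, nthrds>>>(d_x, d_p, d_alpha, 3, n);
 *
 * where d_alpha holds the three coefficients on the device.
 */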

/**
 * Device kernel for add2s2 (a = a + c1 * b)
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/**
 * Device kernel for addsqr2s2 (a = a + c1 * b * b)
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/**
 * Device kernel for add3s2 (a = c1 * b + c2 * c)
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/**
 * Device kernel for invcol1 (a = 1 / a)
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/**
 * Device kernel for invcol2 (a = a / b)
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/**
 * Device kernel for col2 (a = a * b)
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/**
 * Device kernel for col3 (a = b * c)
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/**
 * Device kernel for subcol3 (a = a - b * c)
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/**
 * Device kernel for sub2 (a = a - b)
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/**
 * Device kernel for sub3 (a = b - c)
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/**
 * Device kernel for addcol3 (a = a + b * c)
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}

/**
 * Device kernel for addcol4 (a = a + b * c * d)
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}

/**
 * Device kernel for vdot3 (dot = u . v, pointwise 3-component dot product)
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}

/**
 * Device kernel for vcross (u = v x w, pointwise cross product)
 */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }
}

/**
 * Warp shuffle reduction
 */
template< typename T >
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}
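
/*
 * How the shuffle reduction works (explanatory note): __shfl_down_sync
 * copies val from lane i+offset to lane i within a warp, so the offsets
 * 16, 8, 4, 2, 1 fold the 32 lanes in half at each step. For example, with
 * val initialised to the lane id, lane 0 holds 0+16 after the first step,
 * 0+16+8+24 after the second, and the full sum 496 = 0+1+...+31 after the
 * last. Only lane 0 ends up with the complete warp sum.
 */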

/**
 * Reduction kernel: collapse the n entries of bufred into one partial
 * sum per block, stored in bufred[blockIdx.x]
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
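
/*
 * Two-level reduction sketch (editorial): each block reduces its warps with
 * reduce_warp, parks the per-warp sums in shared memory, and lets warp 0
 * reduce those, so one launch collapses n values into gridDim.x partial
 * sums. A single-block follow-up launch can then produce the final result,
 * e.g. assuming d_buf holds nb partial sums:
 *
 *   reduce_kernel<double><<<1, 1024>>>(d_buf, nb);  // d_buf[0] = total
 */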

/**
 * Reduction kernel for glsc3_many: finish the j column sums left
 * in bufred
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j) {

  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  int i = 512;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i>>1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}

/**
 * Device kernel for glsc3: partial weighted inner product sum(a * b * c),
 * one partial sum per block in buf_h
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
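
/*
 * End-to-end sketch for a weighted inner product (illustrative, hypothetical
 * names): glsc3_kernel leaves one partial sum per block in d_buf, which
 * reduce_kernel then collapses before the scalar is copied back:
 *
 *   glsc3_kernel<double><<<nb, 1024>>>(d_a, d_b, d_c, d_buf, n);
 *   reduce_kernel<double><<<1, 1024>>>(d_buf, nb);
 *   double res;
 *   cudaMemcpy(&res, d_buf, sizeof(double), cudaMemcpyDeviceToHost);
 */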

/**
 * Device kernel for glsc3_many: simultaneous partial inner products
 * of a and c against j vectors b[0..j-1]
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x + y] = tmp;
  __syncthreads();

  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x + y] += buf[(threadIdx.y + i)*blockDim.x + y];
    }
    __syncthreads();
    i = i>>1;
  }

  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x + y] = buf[y];
    }
  }
}
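
/*
 * Layout note (explanatory): glsc3_many_kernel uses a 2D thread block.
 * threadIdx.x selects which of the j vectors in b is being reduced (so
 * blockDim.x must be at least j), while threadIdx.y together with
 * blockIdx.x strides over the n elements. The tree reduction runs over the
 * y dimension only, leaving j partial sums per block, stored contiguously
 * at buf_h[j*blockIdx.x .. j*blockIdx.x + j - 1]; glsc3_reduce_kernel
 * above finishes the column-wise reduction.
 */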

/**
 * Device kernel for glsc2: partial inner product sum(a * b),
 * one partial sum per block in buf_h
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for glsum: partial sum of a, one partial sum per
 * block in buf_h
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for absval (a = |a|)
 */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}

// ========================================================================== //
// Kernels for the point-wise operations

/**
 * Device kernel for point-wise max of two vectors (a = max(a, b))
 */
template <typename T>
__global__ void
    pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], b[i]);
}

/**
 * Device kernel for point-wise max of two vectors (a = max(b, c))
 */
template <typename T>
__global__ void pwmax_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
}

/**
 * Device kernel for point-wise max of vector and scalar (a = max(a, c))
 */
template <typename T>
__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], c);
}

/**
 * Device kernel for point-wise max of vector and scalar (a = max(b, c))
 */
template <typename T>
__global__ void pwmax_sca3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
}

/**
 * Device kernel for point-wise min of two vectors (a = min(a, b))
 */
template <typename T>
__global__ void
    pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], b[i]);
}

/**
 * Device kernel for point-wise min of two vectors (a = min(b, c))
 */
template <typename T>
__global__ void pwmin_vec3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
}

/**
 * Device kernel for point-wise min of vector and scalar (a = min(a, c))
 */
template <typename T>
__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], c);
}

/**
 * Device kernel for point-wise min of vector and scalar (a = min(b, c))
 */
template <typename T>
__global__ void pwmin_sca3_kernel(
    T* __restrict__ a, const T* __restrict__ b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
}

#endif // __MATH_MATH_KERNEL_H__