Neko 0.9.99
A portable framework for high-order spectral element flow simulations
math_kernel.h
#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__
/*
 Copyright (c) 2021-2023, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

/** Device kernel for cmult: a(i) = c * a(i) */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
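
/* Usage sketch (illustrative, not part of the original header): the
   point-wise kernels in this file all use a grid-stride loop, so a 1-D
   launch of any size covers all n elements, even when
   gridDim.x * blockDim.x < n. A typical host-side call, assuming a device
   array a_d of length n:

     const dim3 nthrds(1024, 1, 1);
     const dim3 nblcks((n + 1024 - 1) / 1024, 1, 1);
     cmult_kernel<double><<<nblcks, nthrds>>>(a_d, 2.0, n);
*/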

/** Device kernel for masked reduced copy (gather): a(i) = b(mask(i)), i = 1..m */
template< typename T >
__global__ void masked_red_copy_kernel(T * __restrict__ a,
                                       T * __restrict__ b,
                                       int * __restrict__ mask,
                                       const int n,
                                       const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}

/** Device kernel for masked copy: a(mask(i)) = b(mask(i)), i = 1..m */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}
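
/* Indexing note (inferred from the code, not stated in this header): the
   mask arrays come from the Fortran side, so their entries are 1-based and
   the -1 converts them to C indices. The masked_*_copy kernels additionally
   skip mask[0] (reading mask[i+1]), which is consistent with a mask laid
   out as mask(0:m) with the entry count stored first, whereas
   cfill_mask_kernel below indexes mask[i] directly over mask_size plain
   1-based entries. */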

/** Device kernel for cfill_mask: a(mask(i)) = c, i = 1..mask_size */
template <typename T>
__global__ void cfill_mask_kernel(T * __restrict__ a,
                                  const T c,
                                  const int size,
                                  int * __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < mask_size; i += str) {
    a[mask[i]-1] = c;
  }
}

/** Device kernel for cmult2: a(i) = c * b(i) */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/** Device kernel for cadd: a(i) = a(i) + c */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/** Device kernel for cadd2: a(i) = b(i) + c */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}

/** Device kernel for cfill: a(i) = c */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/** Device kernel for add2: a(i) = a(i) + b(i) */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/** Device kernel for add3: a(i) = b(i) + c(i) */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}

/** Device kernel for add4: a(i) = b(i) + c(i) + d(i) */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}

/** Device kernel for add2s1: a(i) = c1 * a(i) + b(i) */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/** Device kernel for add2s2_many: x(i) = x(i) + sum_j alpha(j) * p(j)(i) */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j++) {
      tmp += p[j][i] * alpha[j];
    }
    x[i] += tmp;
  }
}
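
/* Usage sketch (hypothetical names p_d, p_ptrs_d, alpha_d, nblcks, nthrds):
   add2s2_many applies several scaled updates x += alpha(j) * p(j) in a
   single pass. Both p and alpha are device pointers, and p is an array of
   device pointers, so the host must first assemble the pointer table and
   copy it over:

     std::vector<const double *> p_h(p_cur);        // host-side pointer table
     for (int j = 0; j < p_cur; j++) p_h[j] = p_d[j];
     cudaMemcpy(p_ptrs_d, p_h.data(), p_cur * sizeof(double *),
                cudaMemcpyHostToDevice);
     add2s2_many_kernel<double><<<nblcks, nthrds>>>(x_d, p_ptrs_d, alpha_d,
                                                    p_cur, n);
*/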

/** Device kernel for add2s2: a(i) = a(i) + c1 * b(i) */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/** Device kernel for addsqr2s2: a(i) = a(i) + c1 * b(i)^2 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/** Device kernel for add3s2: a(i) = c1 * b(i) + c2 * c(i) */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/** Device kernel for invcol1: a(i) = 1 / a(i) */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/** Device kernel for invcol2: a(i) = a(i) / b(i) */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/** Device kernel for col2: a(i) = a(i) * b(i) */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/** Device kernel for col3: a(i) = b(i) * c(i) */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/** Device kernel for subcol3: a(i) = a(i) - b(i) * c(i) */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/** Device kernel for sub2: a(i) = a(i) - b(i) */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/** Device kernel for sub3: a(i) = b(i) - c(i) */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/** Device kernel for addcol3: a(i) = a(i) + b(i) * c(i) */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}

/** Device kernel for addcol4: a(i) = a(i) + b(i) * c(i) * d(i) */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}

/** Device kernel for vdot3: dot(i) = u(i) . v(i) for 3-component fields */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}

/** Device kernel for vcross: u(i) = v(i) x w(i) for 3-component fields */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }
}

/** Warp-level sum reduction using shuffle-down exchanges */
template< typename T >
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}
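
/* How reduce_warp works (explanatory note): each __shfl_down_sync halves
   the number of lanes holding live partial sums. With 32 lanes, lane k
   first adds the value held by lane k+16, then k+8, k+4, k+2 and k+1;
   after five steps lane 0 holds the sum of all 32 inputs. The 0xffffffff
   member mask assumes the whole warp participates, which the callers below
   ensure by having every thread of the block enter the reduction. */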

/** Block-level sum reduction: folds n entries of bufred into one partial
    sum per block, stored in bufred(blockIdx) */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
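
/* Usage sketch (illustrative assumption, hypothetical names a_d, bufred_d):
   a full sum is produced in two stages. A kernel such as glsum_kernel below
   first writes one partial sum per block into bufred_d; reduce_kernel is
   then launched with a single block to fold those partials into
   bufred_d[0]:

     glsum_kernel<double><<<nblcks, nthrds>>>(a_d, bufred_d, n);
     reduce_kernel<double><<<1, 1024>>>(bufred_d, nblcks.x);
     cudaMemcpy(&sum, bufred_d, sizeof(double), cudaMemcpyDeviceToHost);
*/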

/** Second-stage reduction for glsc3_many: block y sums column y of the
    n-by-j array of per-block partial results in bufred */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j) {

  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  int i = 512;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i >> 1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}
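
/* Note (inferred from the code): glsc3_reduce_kernel is the second stage
   for glsc3_many_kernel below; block y reduces column y of the n-by-j
   partial-result array. The 1024-entry shared buffer and the fixed initial
   stride of 512 in the tree reduction assume a launch with 1024 threads
   per block. */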

/** Weighted inner product kernel: one partial sum of a(i)*b(i)*c(i) per
    block, stored in buf_h(blockIdx) */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
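
/* Usage sketch (illustrative assumption, hypothetical names): glsc3
   computes the weighted inner product sum(a .* b .* c). The kernel leaves
   one partial sum per block, so the host finishes with the generic
   reduction above:

     glsc3_kernel<double><<<nblcks, nthrds>>>(a_d, b_d, c_d, buf_d, n);
     reduce_kernel<double><<<1, 1024>>>(buf_d, nblcks.x);
     cudaMemcpy(&res, buf_d, sizeof(double), cudaMemcpyDeviceToHost);

   In a distributed run the rank-local result would then go through an MPI
   allreduce, but that happens outside this header. */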

/** Weighted inner products against several vectors b(1..j): per-block
    partial sums of a(i)*b(k)(i)*c(i), stored in buf_h */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x + y] = tmp;
  __syncthreads();

  int i = blockDim.y >> 1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x + y] += buf[(threadIdx.y + i)*blockDim.x + y];
    }
    __syncthreads();
    i = i >> 1;
  }

  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x + y] = buf[y];
    }
  }
}
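
/* Layout note (inferred from the code): glsc3_many uses a 2-D thread
   block; threadIdx.x < j selects one of the j vectors in b, while
   threadIdx.y strides over the n entries. Each block writes j partial sums
   to buf_h[j*blockIdx.x + y], the same row-major layout that
   glsc3_reduce_kernel above reads back as bufred[i*j + y]. The shared
   buffer requires blockDim.x * blockDim.y <= 1024, and the tree reduction
   assumes blockDim.y is a power of two. */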

/** Inner product kernel: one partial sum of a(i)*b(i) per block,
    stored in buf_h(blockIdx) */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/** Sum kernel: one partial sum of a(i) per block, stored in buf_h(blockIdx) */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/** Device kernel for absval: a(i) = |a(i)| */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}

// ========================================================================== //
// Kernels for the point-wise operations

/** Device kernel for pwmax_vec2: a(i) = max(a(i), b(i)) */
template <typename T>
__global__ void
pwmax_vec2_kernel(T * __restrict__ a, const T * __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], b[i]);
}
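
/* Note (hedged): for the floating-point types these kernels are
   instantiated with, max/min resolve to the CUDA device overloads; a
   max(a, b) macro serving the same role is also defined in tensor.cu
   elsewhere in the code base. Example semantics, using pwmax_vec2 as the
   pattern for the whole family:

     pwmax_vec2_kernel<double><<<nblcks, nthrds>>>(a_d, b_d, n);
     // afterwards a_d[i] == max(a_d[i], b_d[i]) for all i < n
*/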

/** Device kernel for pwmax_vec3: a(i) = max(b(i), c(i)) */
template <typename T>
__global__ void pwmax_vec3_kernel(
    T * __restrict__ a, const T * __restrict__ b, const T * __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
}

/** Device kernel for pwmax_sca2: a(i) = max(a(i), c) */
template <typename T>
__global__ void pwmax_sca2_kernel(T * __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(a[i], c);
}

/** Device kernel for pwmax_sca3: a(i) = max(b(i), c) */
template <typename T>
__global__ void pwmax_sca3_kernel(
    T * __restrict__ a, const T * __restrict__ b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
}

/** Device kernel for pwmin_vec2: a(i) = min(a(i), b(i)) */
template <typename T>
__global__ void
pwmin_vec2_kernel(T * __restrict__ a, const T * __restrict__ b, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], b[i]);
}

/** Device kernel for pwmin_vec3: a(i) = min(b(i), c(i)) */
template <typename T>
__global__ void pwmin_vec3_kernel(
    T * __restrict__ a, const T * __restrict__ b, const T * __restrict__ c,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
}

/** Device kernel for pwmin_sca2: a(i) = min(a(i), c) */
template <typename T>
__global__ void pwmin_sca2_kernel(T * __restrict__ a, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(a[i], c);
}

/** Device kernel for pwmin_sca3: a(i) = min(b(i), c) */
template <typename T>
__global__ void pwmin_sca3_kernel(
    T * __restrict__ a, const T * __restrict__ b, const T c, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
}

#endif // __MATH_MATH_KERNEL_H__