math_kernel.h
#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__
/*
 Copyright (c) 2021-2023, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

/**
 * Device kernel for cmult (scale by a constant, a = c * a)
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}

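/*
 * Usage sketch (editorial addition, not part of the original header):
 * every kernel in this file uses the same grid-stride launch convention,
 * so a host-side call, assuming a hypothetical device array d_a of
 * length n, could look like:
 *
 *   const int nthrds = 1024;
 *   const int nblcks = (n + nthrds - 1) / nthrds;
 *   cmult_kernel<double><<<nblcks, nthrds>>>(d_a, 2.0, n);
 *
 * Because of the grid-stride loop the kernel is correct for any grid
 * size; the block count above is just the usual one-element-per-thread
 * choice. Under HIP the same launch can be written with
 * hipLaunchKernelGGL.
 */
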
/**
 * Device kernel for masked copy (a = b at the masked entries;
 * n, the full array length, is unused here)
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}

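/*
 * Note on the mask convention (editorial observation): the mask arrays
 * come from Fortran, so the stored indices are 1-based, and
 * masked_copy_kernel and masked_red_copy_kernel skip mask[0], which
 * appears to hold the mask length. For example, with
 * mask = {3, 1, 5, 9} and m = 3, masked_copy_kernel performs
 *
 *   a[0] = b[0];  a[4] = b[4];  a[8] = b[8];
 *
 * i.e. it copies b into a at the 1-based positions 1, 5 and 9. By
 * contrast, cfill_mask_kernel below reads its (still 1-based) indices
 * starting at mask[0].
 */
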
/**
 * Device kernel for masked reduced copy: gather the masked entries
 * of b into the first m entries of a
 */
template< typename T >
__global__ void masked_red_copy_kernel(T * __restrict__ a,
                                       T * __restrict__ b,
                                       int * __restrict__ mask,
                                       const int n,
                                       const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}

/**
 * Device kernel for cfill_mask: set the masked entries of a to the
 * constant c
 */
template< typename T >
__global__ void cfill_mask_kernel(T * __restrict__ a,
                                  const T c,
                                  const int size,
                                  int * __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < mask_size; i += str) {
    a[mask[i]-1] = c;
  }
}

/**
 * Device kernel for cmult2 (scaled copy, a = c * b)
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/**
 * Device kernel for cadd (add a constant, a = a + c)
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/**
 * Device kernel for cadd2 (a = b + c)
 */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}

/**
 * Device kernel for cfill (fill with a constant, a = c)
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/**
 * Device kernel for add2 (a = a + b)
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/**
 * Device kernel for add3 (a = b + c)
 */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}

/**
 * Device kernel for add4 (a = b + c + d)
 */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}

/**
 * Device kernel for add2s1 (scaled vector addition, a = c1 * a + b)
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/**
 * Device kernel for add2s2_many: x = x + sum over the first p_cur
 * vectors of alpha[j] * p[j]
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i] * alpha[j];
    }
    x[i] += tmp;
  }
}

/**
 * Device kernel for add2s2 (scaled vector addition, a = a + c1 * b)
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/**
 * Device kernel for addsqr2s2 (a = a + c1 * b * b)
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/**
 * Device kernel for add3s2 (a = c1 * b + c2 * c)
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/**
 * Device kernel for invcol1 (element-wise inversion, a = 1 / a)
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/**
 * Device kernel for invcol2 (element-wise division, a = a / b)
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/**
 * Device kernel for col2 (element-wise product, a = a * b)
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/**
 * Device kernel for col3 (element-wise product, a = b * c)
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/**
 * Device kernel for subcol3 (a = a - b * c)
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/**
 * Device kernel for sub2 (a = a - b)
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/**
 * Device kernel for sub3 (a = b - c)
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/**
 * Device kernel for addcol3 (a = a + b * c)
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}

/**
 * Device kernel for addcol4 (a = a + b * c * d)
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}

/**
 * Device kernel for vdot3: element-wise dot product of two 3-vector
 * fields stored component-wise (dot = u1*v1 + u2*v2 + u3*v3)
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}

/**
 * Device kernel for vcross: element-wise cross product of two 3-vector
 * fields stored component-wise (u = v x w)
 */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }
}

/**
 * Warp-level sum reduction using shuffle operations; the cascade starts
 * at an offset of 32, so a 64-wide wavefront is assumed
 */
template< typename T >
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down(val, 32);
  val += __shfl_down(val, 16);
  val += __shfl_down(val, 8);
  val += __shfl_down(val, 4);
  val += __shfl_down(val, 2);
  val += __shfl_down(val, 1);
  return val;
}

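/*
 * Portability note (editorial addition): shuffling down by 32 in the
 * first step only makes sense for 64-wide wavefronts, i.e. AMD GPUs
 * targeted through HIP, where __shfl_down takes no lane mask. A
 * CUDA-only variant for 32-thread warps would start at 16 and use the
 * _sync intrinsics, e.g.:
 *
 *   template< typename T >
 *   __inline__ __device__ T reduce_warp_cuda(T val) {
 *     for (int offset = 16; offset > 0; offset >>= 1)
 *       val += __shfl_down_sync(0xffffffff, val, offset);
 *     return val;
 *   }
 */
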
/**
 * Block-wise sum reduction of the partial sums in bufred; each block
 * writes its result back to bufred[blockIdx.x], so a single-block
 * launch leaves the final sum in bufred[0]
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[64];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  /* reduce within each warp, then combine the per-warp results */
  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

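/*
 * Usage sketch (editorial addition): reduce_kernel is the second pass
 * of a two-pass reduction. A first kernel (e.g. glsum_kernel below)
 * leaves one partial sum per block in a buffer; a single-block launch
 * of reduce_kernel then collapses those partials. Assuming a
 * hypothetical device buffer d_bufred with at least nblcks entries:
 *
 *   glsum_kernel<double><<<nblcks, nthrds>>>(d_a, d_bufred, n);
 *   reduce_kernel<double><<<1, nthrds>>>(d_bufred, nblcks);
 *   // the final sum is now in d_bufred[0]
 */
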
/**
 * Second-stage reduction for glsc3_many: bufred holds n x j partial
 * sums in row-major order, and block y reduces column y in shared
 * memory (assumes blockDim.x == 1024, matching the size of buf)
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j) {

  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  /* tree reduction in shared memory */
  int i = 512;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i>>1;
    __syncthreads();
  }

  /* let a single thread write the final value */
  if (threadIdx.x == 0)
    bufred[y] = buf[0];
}

/**
 * Device kernel for glsc3: per-block partial sums of the weighted
 * inner product sum_i a[i] * b[i] * c[i], written to buf_h
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  /* reduce within each warp, then combine the per-warp results */
  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

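/*
 * Usage sketch (editorial addition): glsc3 computes the weighted inner
 * product sum_i a[i] * b[i] * c[i] in two stages. Assuming hypothetical
 * device arrays d_a, d_b, d_c and a partial-sum buffer d_buf with at
 * least nblcks entries:
 *
 *   glsc3_kernel<double><<<nblcks, nthrds>>>(d_a, d_b, d_c, d_buf, n);
 *   reduce_kernel<double><<<1, nthrds>>>(d_buf, nblcks);
 *   // copy d_buf[0] back to the host for the rank-local result
 *
 * In a multi-rank run this is only the local sum; the global value
 * still requires an MPI allreduce on the host side.
 */
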
/**
 * Device kernel for glsc3_many: computes j inner products
 * sum_i a[i] * b[k][i] * c[i], k = 0..j-1, in one pass; assumes
 * blockDim.x >= j and blockDim.x * blockDim.y <= 1024
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[y][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x + y] = tmp;
  __syncthreads();

  /* tree reduction over the y-dimension of the block */
  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x + y] += buf[(threadIdx.y + i)*blockDim.x + y];
    }
    __syncthreads();
    i = i>>1;
  }

  if (threadIdx.y == 0 && y < j) {
    buf_h[j*blockIdx.x + y] = buf[y];
  }
}

/**
 * Device kernel for glsc2: per-block partial sums of the inner product
 * sum_i a[i] * b[i], written to buf_h
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  /* reduce within each warp, then combine the per-warp results */
  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for glsum: per-block partial sums of sum_i a[i],
 * written to buf_h
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  /* reduce within each warp, then combine the per-warp results */
  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for absval (element-wise absolute value, a = |a|)
 */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}

#endif // __MATH_MATH_KERNEL_H__