#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__

/** Device kernel for cmult: a = c * a */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}

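/*
 * Usage sketch (illustrative, not part of this header): every pointwise
 * kernel in this file uses a grid-stride loop, so any launch configuration
 * covers all n elements. A minimal host-side launch, where the block size
 * of 1024 and the device pointer a_d are assumptions for illustration:
 *
 *   const dim3 nthrds(1024, 1, 1);
 *   const dim3 nblcks((n + 1024 - 1) / 1024, 1, 1);
 *   cmult_kernel<double><<<nblcks, nthrds>>>(a_d, c, n);
 */
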
/** Device kernel for masked reduced copy: a[i] = b[mask[i+1]-1] */
template< typename T >
__global__ void masked_red_copy_kernel(T * __restrict__ a,
                                       T * __restrict__ b,
                                       int * __restrict__ mask,
                                       const int n,
                                       const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}

/** Device kernel for masked copy: a[mask[i+1]-1] = b[mask[i+1]-1] */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}

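/*
 * Note on the mask layout (an assumption inferred from the indexing above):
 * the masked copy kernels read entries starting at mask[1] and subtract one,
 * which matches a one-based Fortran index list whose element 0 carries
 * bookkeeping such as the list length, followed by m indices.
 */
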
/** Device kernel for cfill_mask: a[mask[i]-1] = c */
template< typename T >
__global__ void cfill_mask_kernel(T * __restrict__ a,
                                  const T c,
                                  const int size,
                                  int * __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < mask_size; i += str) {
    a[mask[i]-1] = c;
  }
}

/** Device kernel for cmult2: a = c * b */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/** Device kernel for cadd: a = a + c */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/** Device kernel for cadd2: a = b + c */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}

/** Device kernel for cfill: a = c */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/** Device kernel for add2: a = a + b */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/** Device kernel for add3: a = b + c */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}

/** Device kernel for add4: a = b + c + d */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}

/** Device kernel for add2s1: a = c1 * a + b */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/** Device kernel for add2s2 over many vectors: x = x + sum_j alpha(j) * p_j */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i]*alpha[j];
    }
    x[i] += tmp;
  }
}

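/*
 * The fused inner loop above accumulates all p_cur contributions into a
 * register before touching x, so x is read and written once per element
 * instead of once per vector; p is an array of p_cur device pointers.
 */
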
/** Device kernel for add2s2: a = a + c1 * b */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/** Device kernel for addsqr2s2: a = a + c1 * (b * b) */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/** Device kernel for add3s2: a = c1 * b + c2 * c */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/** Device kernel for invcol1: a = 1 / a */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/** Device kernel for invcol2: a = a / b */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/** Device kernel for col2: a = a * b */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/** Device kernel for col3: a = b * c */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/** Device kernel for subcol3: a = a - b * c */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/** Device kernel for sub2: a = a - b */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/** Device kernel for sub3: a = b - c */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/** Device kernel for addcol3: a = a + b * c */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}

/** Device kernel for addcol4: a = a + b * c * d */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}

/** Device kernel for vdot3: dot = u . v for three-component vector fields */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}

/** Device kernel for vcross: u = v x w for three-component vector fields */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }
}

/** Warp-level sum reduction using shuffle instructions */
template< typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}

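/*
 * reduce_warp assumes a 32-lane warp: the five __shfl_down_sync steps with
 * offsets 16, 8, 4, 2, 1 halve the number of contributing lanes each time,
 * so after the last step lane 0 holds the sum over the whole warp. Only
 * lane 0's return value is meaningful.
 */
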
/** Device kernel for the final reduction of per-block partial sums */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str)
    sum += bufred[i];

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

/** Device kernel for the second reduction stage of glsc3_many */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j) {

  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  int i = blockDim.x>>1;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i>>1;
    __syncthreads();
  }

  if (idx == 0) {
    bufred[y] = buf[0];
  }
}

/** Device kernel for glsc3: per-block partial sums of a . b . c */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

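/*
 * The reduction tail above is the standard CUDA block reduction: each warp
 * reduces its 32 values with reduce_warp, lane 0 of every warp parks its
 * partial sum in shared[], and the first warp then reduces those (at most
 * 32) partials so that thread 0 holds the block total. This assumes
 * blockDim.x is a multiple of warpSize and at most 32 * warpSize = 1024.
 */
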
/** Device kernel for glsc3 applied to j vectors b[0] ... b[j-1] at once */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x + y] = tmp;
  __syncthreads();

  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x + y] += buf[(threadIdx.y + i)*blockDim.x + y];
    }
    __syncthreads();
    i = i>>1;
  }

  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x + y] = buf[y];
    }
  }
}

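/*
 * Thread layout for glsc3_many_kernel: threadIdx.x selects one of the j
 * vectors (lanes with threadIdx.x >= j contribute zero) while threadIdx.y
 * strides over the arrays, so buf holds a blockDim.y x blockDim.x tile of
 * partials that is folded over threadIdx.y. blockDim.x * blockDim.y must
 * not exceed 1024, the size of buf.
 */
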
/** Device kernel for glsc2: per-block partial sums of a . b */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/** Device kernel for glsum: per-block partial sums of a */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str)
    sum += a[i];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

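/*
 * Usage sketch (illustrative; buffer names and sizes are assumptions, not
 * part of this header): glsum, glsc2, and glsc3 leave one partial sum per
 * block in buf_h, so a full reduction takes a second pass, e.g.
 *
 *   glsum_kernel<double><<<nblcks, nthrds>>>(a_d, bufred_d, n);
 *   reduce_kernel<double><<<1, 1024>>>(bufred_d, nblcks.x);
 *
 * after which bufred_d[0] holds the global sum.
 */
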
/** Device kernel for absval: a = |a| */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}

/** Device kernel for point-wise max: a = max(a, b) */
template <typename T>
__global__ void pwmax_vec2_kernel(T * __restrict__ a,
                                  const T * __restrict__ b,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str)
    a[i] = max(a[i], b[i]);
}

/** Device kernel for point-wise max: a = max(b, c) */
template <typename T>
__global__ void pwmax_vec3_kernel(T * __restrict__ a,
                                  const T * __restrict__ b,
                                  const T * __restrict__ c,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str)
    a[i] = max(b[i], c[i]);
}

/** Device kernel for point-wise max against a scalar: a = max(a, c) */
template <typename T>
__global__ void pwmax_sca2_kernel(T * __restrict__ a,
                                  const T c,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str)
    a[i] = max(a[i], c);
}

/** Device kernel for point-wise max against a scalar: a = max(b, c) */
template <typename T>
__global__ void pwmax_sca3_kernel(T * __restrict__ a,
                                  const T * __restrict__ b,
                                  const T c,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str)
    a[i] = max(b[i], c);
}

/** Device kernel for point-wise min: a = min(a, b) */
template <typename T>
__global__ void pwmin_vec2_kernel(T * __restrict__ a,
                                  const T * __restrict__ b,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str)
    a[i] = min(a[i], b[i]);
}

/** Device kernel for point-wise min: a = min(b, c) */
template <typename T>
__global__ void pwmin_vec3_kernel(T * __restrict__ a,
                                  const T * __restrict__ b,
                                  const T * __restrict__ c,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str)
    a[i] = min(b[i], c[i]);
}

/** Device kernel for point-wise min against a scalar: a = min(a, c) */
template <typename T>
__global__ void pwmin_sca2_kernel(T * __restrict__ a,
                                  const T c,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str)
    a[i] = min(a[i], c);
}

/** Device kernel for point-wise min against a scalar: a = min(b, c) */
template <typename T>
__global__ void pwmin_sca3_kernel(T * __restrict__ a,
                                  const T * __restrict__ b,
                                  const T c,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str)
    a[i] = min(b[i], c);
}

#endif // __MATH_MATH_KERNEL_H__