#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__
/** Device kernel for cmult: a = c * a */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
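
/*
 * Illustrative usage (not part of the original header): a minimal
 * host-side launch sketch for these grid-stride kernels, assuming the
 * header is compiled by nvcc or hipcc and that a_d is a device array of
 * length n allocated by the caller. The helper name and the block size
 * of 1024 are assumptions for the sketch, not requirements.
 */
template< typename T >
static inline void cmult_example_launch(T * a_d, const T c, const int n) {
  const int nthreads = 1024;
  const int nblocks = (n + nthreads - 1) / nthreads;
  /* Each thread strides through the array, so any grid size is correct */
  cmult_kernel<T><<<nblocks, nthreads>>>(a_d, c, n);
}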

/** Device kernel for masked_copy: a[mask[i]] = b[mask[i]] */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}
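
/*
 * Note on the mask layout (inferred from the i+1 / -1 indexing, not
 * stated in the original): the mask appears to come from 1-based Fortran
 * code, with mask[0] holding the mask length and mask[1..m] holding
 * 1-based indices. For example, a mask {3, 1, 5, 9} launched with m = 3
 * copies b[0], b[4] and b[8] into the same positions of a.
 */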

/** Device kernel for masked_red_copy: a[i] = b[mask[i]] */
template< typename T >
__global__ void masked_red_copy_kernel(T * __restrict__ a,
                                       T * __restrict__ b,
                                       int * __restrict__ mask,
                                       const int n,
                                       const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}

/** Device kernel for cfill_mask: a[mask[i]] = c */
template< typename T >
__global__ void cfill_mask_kernel(T * __restrict__ a,
                                  const T c,
                                  const int size,
                                  int * __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < mask_size; i += str) {
    a[mask[i]-1] = c;
  }
}

/** Device kernel for cmult2: a = c * b */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/** Device kernel for cadd: a = a + c */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/** Device kernel for cadd2: a = b + c */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}

/** Device kernel for cfill: a = c */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/** Device kernel for add2: a = a + b */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/** Device kernel for add3: a = b + c */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}

/** Device kernel for add4: a = b + c + d */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}

/** Device kernel for add2s1: a = c1 * a + b */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/** Device kernel for add2s2 over many vectors: x = x + sum_j alpha[j] * p[j] */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i]*alpha[j];
    }
    x[i] += tmp;
  }
}

/** Device kernel for add2s2: a = a + c1 * b */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/** Device kernel for addsqr2s2: a = a + c1 * (b * b) */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/** Device kernel for add3s2: a = c1 * b + c2 * c */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/** Device kernel for invcol1: a = 1 / a */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/** Device kernel for invcol2: a = a / b */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/** Device kernel for col2: a = a * b */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/** Device kernel for col3: a = b * c */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/** Device kernel for subcol3: a = a - b * c */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/** Device kernel for sub2: a = a - b */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/** Device kernel for sub3: a = b - c */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/** Device kernel for addcol3: a = a + b * c */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}

/** Device kernel for addcol4: a = a + b * c * d */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}

/** Device kernel for vdot3: dot = u . v (pointwise 3D dot product) */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}

/** Device kernel for vcross: u = v x w (pointwise 3D cross product) */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }
}

/** Warp-level sum reduction via shuffles (64-lane wavefront, HIP-style __shfl_down) */
template< typename T >
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down(val, 32);
  val += __shfl_down(val, 16);
  val += __shfl_down(val, 8);
  val += __shfl_down(val, 4);
  val += __shfl_down(val, 2);
  val += __shfl_down(val, 1);
  return val;
}
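
/*
 * Illustrative walkthrough (not from the original header): for a 64-lane
 * wavefront holding the values 1..64, the first __shfl_down(val, 32) adds
 * lane i+32 into lane i, halving the active width; after the final
 * __shfl_down(val, 1), lane 0 holds the full sum 2080. Only lane 0's
 * result is meaningful, which is why callers store it when lane == 0.
 */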

/** Device kernel reducing a buffer of partial sums into bufred[blockIdx.x] */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[64];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
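
/*
 * Note (inferred from the code, not stated in the original): shared[64]
 * sizes the scratch buffer for at most 64 warps per block, and blockDim.x
 * is assumed to be a multiple of warpSize so blockDim.x / warpSize counts
 * the warps exactly.
 */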

/** Device kernel for the reduction step of glsc3_many: one block per vector */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j) {

  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  int i = 512;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i>>1;
    __syncthreads();
  }

  if (threadIdx.x == 0)
    bufred[y] = buf[0];
}

/** Device kernel for glsc3: per-block partial sums of a * b * c */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
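
/*
 * Illustrative two-stage reduction driver (not from the original header):
 * glsc3_kernel leaves one partial sum per block in buf_d, and a single
 * one-block reduce_kernel launch collapses those partials into buf_d[0],
 * which then holds sum(a * b * c). The helper name, buffer sizing
 * (buf_d must hold at least nblocks elements) and block size are
 * assumptions for the sketch.
 */
template< typename T >
static inline void glsc3_example(const T * a_d, const T * b_d, const T * c_d,
                                 T * buf_d, const int n) {
  const int nthreads = 1024;
  const int nblocks = (n + nthreads - 1) / nthreads;
  glsc3_kernel<T><<<nblocks, nthreads>>>(a_d, b_d, c_d, buf_d, n);
  /* One block grid-strides over all nblocks partials, so a single
     launch always suffices */
  reduce_kernel<T><<<1, nthreads>>>(buf_d, nblocks);
}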

/** Device kernel for glsc3 over many vectors: j weighted inner products at once */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x+y] = tmp;
  __syncthreads();

  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x + y] += buf[(threadIdx.y + i)*blockDim.x + y];
    }
    __syncthreads();
    i = i>>1;
  }

  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x+y] = buf[y];
    }
  }
}
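
/*
 * Layout note for glsc3_many_kernel (inferred from the indexing, not
 * stated in the original): the block is two-dimensional, with threadIdx.x
 * selecting one of the j vectors in b and threadIdx.y striding over the
 * arrays, so blockDim.x must be at least j. Each block writes j partial
 * sums to buf_h[j*blockIdx.x .. j*blockIdx.x + j - 1], which
 * glsc3_reduce_kernel then collapses per vector.
 */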

/** Device kernel for glsum: per-block partial sums of a */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/** Device kernel for glsc2: per-block partial sums of a * b */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/** Device kernel for absval: a = |a| */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}

#endif // __MATH_MATH_KERNEL_H__