1 #ifndef __MATH_MATH_KERNEL_H__
2 #define __MATH_MATH_KERNEL_H__
/**
 * Constant multiplication: a = c * a.
 * Grid-stride loop over a 1-D launch; any grid/block size is valid.
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
/**
 * Masked copy: a(mask(i)) = b(mask(i)) for the m masked entries.
 * NOTE(review): mask entries look 1-based (Fortran style) and the loop reads
 * mask[i+1], i.e. mask[0] presumably holds the mask length — confirm against
 * the host-side caller. n is the full vector length (unused here).
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}
/**
 * Fill a constant into the masked entries of a vector: a(mask(i)) = c.
 * NOTE(review): mask entries are treated as 1-based (mask[i]-1) — confirm
 * against the host-side caller. size is the full vector length (unused here).
 */
template< typename T >
__global__ void cfill_mask_kernel(T * __restrict__ a,
                                  const T c,
                                  const int size,
                                  int * __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < mask_size; i += str) {
    a[mask[i]-1] = c;
  }
}
/**
 * Constant multiplication into a second vector: a = c * b.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}
/**
 * Constant addition: a = a + c.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}
/**
 * Constant addition into a second vector: a = b + c.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}
/**
 * Constant fill: a = c.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}
/**
 * Vector addition: a = a + b.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}
/**
 * Vector addition into a third vector: a = b + c.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}
/**
 * Scaled vector addition (scale on the destination): a = c1 * a + b.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}
/**
 * Fused multi-vector scaled addition: x = x + sum_j alpha[j] * p[j],
 * for j = 0..p_cur-1. Accumulating into a register (tmp) keeps the
 * update to x[i] to a single global write per element.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i] * alpha[j];
    }
    x[i] += tmp;
  }
}
/**
 * Scaled vector addition (scale on the source): a = a + c1 * b.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}
/**
 * Scaled squared-vector addition: a = a + c1 * (b * b).
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}
/**
 * Scaled sum of two vectors: a = c1 * b + c2 * c.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}
/**
 * In-place element inversion: a = 1 / a.
 * No guard against zero entries — caller must ensure a[i] != 0.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}
/**
 * Element-wise division: a = a / b.
 * No guard against zero entries — caller must ensure b[i] != 0.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}
/**
 * Element-wise multiplication: a = a * b.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}
/**
 * Element-wise multiplication into a third vector: a = b * c.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}
/**
 * Subtract an element-wise product: a = a - b * c.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}
/**
 * Vector subtraction: a = a - b.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}
/**
 * Vector subtraction into a third vector: a = b - c.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}
/**
 * Add an element-wise product: a = a + b * c.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}
/**
 * Add an element-wise triple product: a = a + b * c * d.
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}
/**
 * Point-wise dot product of two 3-component vector fields:
 * dot = u1*v1 + u2*v2 + u3*v3 (structure-of-arrays layout).
 * Grid-stride loop over a 1-D launch.
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}
/**
 * Warp-level sum reduction via shuffle-down; after the call, lane 0
 * holds the sum of val across all 32 lanes. All lanes of the warp
 * must participate (full 0xffffffff mask). Requires SM30+; the
 * *_sync variants are mandatory on Volta+ (independent thread
 * scheduling).
 */
template< typename T >
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}
/**
 * Second-stage reduction: sums the n per-block partial results in
 * bufred and writes one partial sum per block back to
 * bufred[blockIdx.x]. Two-level scheme: warp shuffle reduction, then
 * one warp reduces the per-warp partials staged in shared memory.
 * Assumes blockDim.x is a multiple of warpSize and at most 1024
 * (shared[32] holds one slot per warp).
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str)
  {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  // First warp reduces the per-warp partials.
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
/**
 * Reduction stage for the "many" inner products: for column
 * y = blockIdx.x (0 <= y < j) sum the n row entries bufred[i*j + y]
 * and store the result in bufred[y]. One block per column; shared
 * buffer requires blockDim.x <= 1024 and a power-of-two blockDim.x
 * for the tree reduction.
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j) {

  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step)
  {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  // Shared-memory tree reduction over the block.
  int i = blockDim.x>>1;
  while (i != 0)
  {
    if (threadIdx.x < i && (threadIdx.x + i) < n)
    {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i>>1;
    __syncthreads();
  }

  if (threadIdx.x == 0)
  {
    bufred[y] = buf[0];
  }
}
/**
 * Weighted inner product sum(a * b * c): each block writes its partial
 * sum to buf_h[blockIdx.x]; a second-stage reduction (or host sum)
 * combines the per-block results. Assumes blockDim.x is a multiple of
 * warpSize and at most 1024.
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  // First warp reduces the per-warp partials.
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
/**
 * Batched weighted inner products: for each of the j vectors p = b[y],
 * compute a partial sum of a * p * c. 2-D block layout: threadIdx.x
 * selects the vector (y), threadIdx.y strides over the n elements.
 * Each block writes j partials to buf_h[j*blockIdx.x + y]; a
 * glsc3_reduce_kernel pass finishes the reduction. Requires
 * blockDim.x*blockDim.y <= 1024 and power-of-two blockDim.y.
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x+y] = tmp;
  __syncthreads();

  // Tree reduction along the y dimension of the block.
  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x +y] += buf[(threadIdx.y + i)*blockDim.x+y];
    }
    __syncthreads();
    i = i>>1;
  }

  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x+y] = buf[y];
    }
  }
}
/**
 * Inner product sum(a * b): each block writes its partial sum to
 * buf_h[blockIdx.x]; a second-stage reduction combines them. Assumes
 * blockDim.x is a multiple of warpSize and at most 1024.
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  // First warp reduces the per-warp partials.
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
/**
 * Global sum(a): each block writes its partial sum to
 * buf_h[blockIdx.x]; a second-stage reduction combines them. Assumes
 * blockDim.x is a multiple of warpSize and at most 1024.
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str)
  {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  // First warp reduces the per-warp partials.
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
__global__ void const T *__restrict__ x
__global__ void addcol4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void reduce_kernel(T *bufred, const int n)
__global__ void invcol2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__inline__ __device__ T reduce_warp(T val)
__global__ void masked_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
__global__ void glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h, const int j, const int n)
__global__ void cfill_mask_kernel(T *__restrict__ a, const T c, const int size, int *__restrict__ mask, const int mask_size)
__global__ void glsc3_reduce_kernel(T *bufred, const int n, const int j)
__global__ void add3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T c1, const T c2, const int n)
__global__ void add2s1_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void add2s2_many_kernel(T *__restrict__ x, const T **p, const T *alpha, const int p_cur, const int n)
__global__ void cmult_kernel(T *__restrict__ a, const T c, const int n)
__global__ void addcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void col2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void col3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void sub2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void cmult2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void sub3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsum_kernel(const T *a, T *buf_h, const int n)
__global__ void glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
__global__ void add2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void vdot3_kernel(T *__restrict__ dot, const T *__restrict__ u1, const T *__restrict__ u2, const T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const int n)
__global__ void invcol1_kernel(T *__restrict__ a, const int n)
__global__ void add3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void cfill_kernel(T *__restrict__ a, const T c, const int n)
__global__ void addsqr2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void cadd2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void subcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void cadd_kernel(T *__restrict__ a, const T c, const int n)