1 #ifndef __MATH_MATH_KERNEL_H__
2 #define __MATH_MATH_KERNEL_H__
/**
 * Scalar multiply: a(i) = c * a(i), i = 0..n-1.
 * One-dimensional launch; grid-stride loop so any grid size covers n.
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
/**
 * Masked copy: a(mask(i)) = b(mask(i)) for the m masked entries.
 * Entry i of the mask is read at mask[i+1] and shifted by -1, i.e. the
 * list holds 1-based indices with a header slot at mask[0].
 * NOTE(review): presumably mask[0] carries the list length (Fortran-side
 * packing) — confirm against the caller.  n (full vector length) is not
 * used in the loop bound.
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < m; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}
/**
 * Scalar multiply into a second vector: a(i) = c * b(i).
 * Grid-stride loop over n entries.
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}
/**
 * Scalar add: a(i) = a(i) + c.
 * Grid-stride loop over n entries.
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}
/**
 * Fill: a(i) = c for all n entries.
 * Grid-stride loop; works with any launch configuration.
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}
/**
 * Vector addition: a(i) = a(i) + b(i).
 * Grid-stride loop over n entries.
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}
/**
 * Scaled addition (scale on a): a(i) = c1 * a(i) + b(i).
 * Grid-stride loop over n entries.
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}
/**
 * Accumulate many scaled vectors: x(i) += sum_j alpha(j) * p(j)(i)
 * for j = 0..p_cur-1.  p is an array of p_cur device vector pointers.
 * The per-element partial sum is kept in a register and written once,
 * so x is read-modified-written a single time per element.
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i]*alpha[j];
    }
    x[i] += tmp;
  }
}
/**
 * Scaled addition (scale on b): a(i) = a(i) + c1 * b(i).
 * Grid-stride loop over n entries.
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}
/**
 * Scaled square accumulation: a(i) = a(i) + c1 * b(i)^2.
 * Grid-stride loop over n entries.
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}
/**
 * Two-vector linear combination: a(i) = c1 * b(i) + c2 * c(i).
 * Grid-stride loop over n entries; a is write-only here.
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}
/**
 * In-place reciprocal: a(i) = 1 / a(i).
 * No zero guard — caller must ensure a has no zero entries.
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}
/**
 * Element-wise division: a(i) = a(i) / b(i).
 * No zero guard — caller must ensure b has no zero entries.
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}
/**
 * Element-wise multiply: a(i) = a(i) * b(i).
 * Grid-stride loop over n entries.
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}
/**
 * Element-wise product into a third vector: a(i) = b(i) * c(i).
 * Grid-stride loop over n entries; a is write-only here.
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}
/**
 * Subtract an element-wise product: a(i) = a(i) - b(i) * c(i).
 * Grid-stride loop over n entries.
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}
/**
 * Vector subtraction: a(i) = a(i) - b(i).
 * Grid-stride loop over n entries.
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}
/**
 * Vector subtraction into a third vector: a(i) = b(i) - c(i).
 * Grid-stride loop over n entries; a is write-only here.
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}
/**
 * Accumulate an element-wise product: a(i) = a(i) + b(i) * c(i).
 * Grid-stride loop over n entries.
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}
/**
 * Accumulate a triple element-wise product:
 * a(i) = a(i) + b(i) * c(i) * d(i).
 * Grid-stride loop over n entries.
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}
/**
 * Point-wise dot product of two 3-component vector fields:
 * dot(i) = u1(i)*v1(i) + u2(i)*v2(i) + u3(i)*v3(i).
 * Structure-of-arrays layout keeps all loads coalesced.
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}
/**
 * Sum-reduce val across the executing warp/wavefront using shuffle-down;
 * lane 0 ends up with the total.
 * NOTE(review): the first offset is 32 and the mask-less __shfl_down is
 * used, which matches a 64-wide HIP/AMD wavefront.  On CUDA (warp of 32,
 * Volta+) this would need __shfl_down_sync with offsets 16..1 — confirm
 * the target platform before reusing this header there.
 */
template< typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down(val, 32);
  val += __shfl_down(val, 16);
  val += __shfl_down(val, 8);
  val += __shfl_down(val, 4);
  val += __shfl_down(val, 2);
  val += __shfl_down(val, 1);
  return val;
}
/**
 * Second-stage reduction: sums the n partial results in bufred and writes
 * one value per block back into bufred[blockIdx.x].
 * Per-thread grid-stride accumulation, then warp shuffles plus one round
 * through shared memory (one slot per warp, up to 64 warps).
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str)
    sum += bufred[i];

  __shared__ T shared[64];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  /* reduce within each warp, then stage one value per warp */
  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  /* first warp reduces the per-warp partials */
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
/**
 * Second-stage reduction for glsc3_many: bufred holds n rows of j partial
 * sums (row-major, stride j); block y sums column y into bufred[y].
 * Requires blockDim.x <= 1024 (shared buffer size) and a power-of-two
 * block size for the tree reduction.
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j) {

  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step)
    buf[idx] += bufred[i*j + y];
  __syncthreads();

  /* shared-memory tree reduction; the (idx + i) < n guard skips lanes
     whose slot was never written past the valid range */
  int i = blockDim.x >> 1;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    __syncthreads();
    i = i >> 1;
  }

  if (threadIdx.x == 0)
    bufred[y] = buf[0];
}
/**
 * Weighted inner product, first stage: per-block partial sums of
 * sum_i a(i) * b(i) * c(i) written to buf_h[blockIdx.x].
 * A host- or device-side second stage must combine the per-block values.
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
/**
 * Batched weighted inner products, first stage: for each of the j vectors
 * b(y), computes per-block partials of sum_i a(i) * b(y)(i) * c(i).
 * 2-D block: threadIdx.x = which inner product (y), threadIdx.y = parallel
 * slice of i.  Results land in buf_h[blockIdx.x * j + y] for a later
 * glsc3_reduce_kernel pass.  Requires blockDim.x >= j,
 * blockDim.x * blockDim.y <= 1024, and power-of-two blockDim.y.
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {      /* lanes past j have no vector to read */
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x+y] = tmp;
  __syncthreads();

  /* tree reduction over threadIdx.y for each y column */
  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x + y] += buf[(threadIdx.y + i)*blockDim.x + y];
    }
    __syncthreads();
    i = i>>1;
  }

  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x+y] = buf[y];
    }
  }
}
/**
 * Inner product, first stage: per-block partial sums of sum_i a(i) * b(i)
 * written to buf_h[blockIdx.x]; combine the per-block values afterwards.
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
/**
 * Global sum, first stage: per-block partial sums of sum_i a(i)
 * written to buf_h[blockIdx.x]; combine the per-block values afterwards.
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str)
    sum += a[i];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
/* NOTE(review): this region was a scrambled dump of kernel signatures
 * duplicating the definitions earlier in the file, plus one syntactically
 * invalid fragment; it is replaced with the include-guard terminator,
 * which was otherwise missing from this view of the file. */
#endif /* __MATH_MATH_KERNEL_H__ */