1 #ifndef __MATH_MATH_KERNEL_H__
2 #define __MATH_MATH_KERNEL_H__
/**
 * Device kernel: scale a vector by a constant, a = c * a.
 *
 * Grid-stride loop: correct for any grid/block configuration and any n >= 0.
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
/**
 * Device kernel: masked vector copy, a[mask[i]] = b[mask[i]].
 *
 * The index list appears 1-based (Fortran-style): entries are read from
 * mask[1..m] and decremented before use — TODO confirm mask[0] convention
 * against the host-side caller. m is the number of masked entries, n the
 * full vector length (unused here beyond the interface).
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}
/**
 * Device kernel: constant times a vector into another, a = c * b.
 *
 * Grid-stride loop over n elements.
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}
/**
 * Device kernel: add a constant to a vector, a = a + c.
 *
 * Grid-stride loop over n elements.
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}
/**
 * Device kernel: fill a vector with a constant, a = c.
 *
 * Grid-stride loop over n elements.
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}
/**
 * Device kernel: vector addition, a = a + b.
 *
 * Grid-stride loop over n elements.
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}
/**
 * Device kernel: scaled vector addition, a = c1 * a + b
 * (the scalar multiplies the destination operand).
 *
 * Grid-stride loop over n elements.
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}
/**
 * Device kernel: accumulate several scaled vectors at once,
 * x = x + sum_j alpha[j] * p[j], for j in [0, p_cur).
 *
 * p is an array of p_cur device pointers, each to a vector of length n.
 * Accumulating into a register (tmp) keeps a single global store per
 * element regardless of p_cur.
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i] * alpha[j];
    }
    x[i] += tmp;
  }
}
/**
 * Device kernel: scaled vector addition, a = a + c1 * b
 * (the scalar multiplies the source operand).
 *
 * Grid-stride loop over n elements.
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}
/**
 * Device kernel: scaled squared-vector addition, a = a + c1 * (b * b).
 *
 * Grid-stride loop over n elements.
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}
/**
 * Device kernel: linear combination of two vectors, a = c1 * b + c2 * c.
 *
 * Grid-stride loop over n elements; a is write-only here.
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}
/**
 * Device kernel: in-place reciprocal, a = 1 / a.
 *
 * No guard against zero entries — division by zero yields inf/nan
 * per IEEE semantics for floating-point T.
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;  /* converted to T so float kernels avoid double math */

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}
/**
 * Device kernel: element-wise division, a = a / b.
 *
 * Grid-stride loop over n elements; no guard against zero divisors.
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}
/**
 * Device kernel: element-wise (Hadamard) product, a = a * b.
 *
 * Grid-stride loop over n elements.
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}
/**
 * Device kernel: element-wise product into a third vector, a = b * c.
 *
 * Grid-stride loop over n elements; a is write-only here.
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}
/**
 * Device kernel: subtract an element-wise product, a = a - b * c.
 *
 * Grid-stride loop over n elements.
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}
/**
 * Device kernel: vector subtraction, a = a - b.
 *
 * Grid-stride loop over n elements.
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}
/**
 * Device kernel: vector subtraction into a third vector, a = b - c.
 *
 * Grid-stride loop over n elements; a is write-only here.
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}
/**
 * Device kernel: add an element-wise product, a = a + b * c.
 *
 * Grid-stride loop over n elements.
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}
/**
 * Device kernel: add a triple element-wise product, a = a + b * c * d.
 *
 * Grid-stride loop over n elements.
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}
/**
 * Device kernel: point-wise dot product of two 3-component vector fields,
 * dot = u1*v1 + u2*v2 + u3*v3 (component arrays stored SoA-style, which
 * keeps every load coalesced).
 *
 * Grid-stride loop over n points; dot is write-only here.
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}
/**
 * Warp-level sum reduction via shuffle-down.
 *
 * Uses the full 0xffffffff participation mask, so all 32 lanes of the
 * warp must be active when calling. The warp total ends up in lane 0;
 * other lanes hold partial sums. Requires the *_sync shuffle variants
 * (CUDA 9+, mandatory on Volta and later).
 */
template< typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}
/**
 * Device kernel: second-stage reduction of per-block partial sums.
 *
 * Each block sums a grid-stride slice of bufred, reduces it warp-by-warp
 * through shared memory, and writes its total back to bufred[blockIdx.x].
 * Assumes blockDim.x is a multiple of warpSize and at most 1024
 * (shared[] holds one slot per warp, max 32 warps).
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();  /* all warp totals must be published before re-read */

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
/**
 * Device kernel: final reduction for the "many" inner products.
 *
 * bufred holds n partial results for each of j interleaved reductions
 * (entry i of reduction y at bufred[i*j + y]). Block y accumulates its
 * reduction into shared memory and tree-reduces it, leaving the total
 * in bufred[y]. Launch with gridDim.x == j; blockDim.x <= 1024 (size of
 * the shared buffer) and a power of two for the tree reduction.
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j) {

  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i * j + y];
  }
  __syncthreads();

  int i = blockDim.x >> 1;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i >> 1;
    __syncthreads();  /* outside the if: every thread must reach it */
  }

  if (threadIdx.x == 0) {
    bufred[y] = buf[0];
  }
}
/**
 * Device kernel: first stage of the weighted inner product
 * sum_i a[i] * b[i] * c[i].
 *
 * Each block writes its partial sum to buf_h[blockIdx.x]; a second-stage
 * reduction (or host sum) finishes the job. Assumes blockDim.x is a
 * multiple of warpSize and at most 1024.
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();  /* publish warp totals before the final warp reads them */

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
/**
 * Device kernel: first stage of j simultaneous weighted inner products
 * sum_i a[i] * b[y][i] * c[i] for y in [0, j).
 *
 * 2D block: threadIdx.x selects which of the j products a thread works
 * on, threadIdx.y strides over the data. Partial sums are tree-reduced
 * over y in shared memory; thread row 0 writes the block's j partials
 * to buf_h[j*blockIdx.x + y]. Requires blockDim.x >= j,
 * blockDim.x*blockDim.y <= 1024 and blockDim.y a power of two.
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {  /* threads beyond j contribute zero; b[y] would be OOB */
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y * blockDim.x + y] = tmp;
  __syncthreads();

  int i = blockDim.y >> 1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y * blockDim.x + y] += buf[(threadIdx.y + i) * blockDim.x + y];
    }
    __syncthreads();  /* outside the if: every thread must reach it */
    i = i >> 1;
  }

  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j * blockIdx.x + y] = buf[y];
    }
  }
}
/**
 * Device kernel: first stage of the inner product sum_i a[i] * b[i].
 *
 * Each block writes its partial sum to buf_h[blockIdx.x]. Assumes
 * blockDim.x is a multiple of warpSize and at most 1024.
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();  /* publish warp totals before the final warp reads them */

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
/**
 * Device kernel: first stage of the global sum sum_i a[i].
 *
 * Each block writes its partial sum to buf_h[blockIdx.x]. Assumes
 * blockDim.x is a multiple of warpSize and at most 1024.
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();  /* publish warp totals before the final warp reads them */

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
/*
 * Kernel index (all defined above in this header; one garbled fragment
 * from the original signature dump was unrecoverable and is omitted):
 *
 *   cmult_kernel(T *a, const T c, const int n)
 *   cmult2_kernel(T *a, T *b, const T c, const int n)
 *   cadd_kernel(T *a, const T c, const int n)
 *   cfill_kernel(T *a, const T c, const int n)
 *   masked_copy_kernel(T *a, T *b, int *mask, const int n, const int m)
 *   add2_kernel(T *a, const T *b, const int n)
 *   add2s1_kernel(T *a, const T *b, const T c1, const int n)
 *   add2s2_kernel(T *a, const T *b, const T c1, const int n)
 *   add2s2_many_kernel(T *x, const T **p, const T *alpha, const int p_cur, const int n)
 *   addsqr2s2_kernel(T *a, const T *b, const T c1, const int n)
 *   add3s2_kernel(T *a, const T *b, const T *c, const T c1, const T c2, const int n)
 *   invcol1_kernel(T *a, const int n)
 *   invcol2_kernel(T *a, const T *b, const int n)
 *   col2_kernel(T *a, const T *b, const int n)
 *   col3_kernel(T *a, const T *b, const T *c, const int n)
 *   sub2_kernel(T *a, const T *b, const int n)
 *   sub3_kernel(T *a, const T *b, const T *c, const int n)
 *   subcol3_kernel(T *a, const T *b, const T *c, const int n)
 *   addcol3_kernel(T *a, const T *b, const T *c, const int n)
 *   addcol4_kernel(T *a, const T *b, const T *c, const T *d, const int n)
 *   vdot3_kernel(T *dot, const T *u1, const T *u2, const T *u3,
 *                const T *v1, const T *v2, const T *v3, const int n)
 *   reduce_warp(T val)                       [__inline__ __device__]
 *   reduce_kernel(T *bufred, const int n)
 *   glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
 *   glsc3_reduce_kernel(T *bufred, const int n, const int j)
 *   glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h,
 *                     const int j, const int n)
 *   glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
 *   glsum_kernel(const T *a, T *buf_h, const int n)
 */

#endif // __MATH_MATH_KERNEL_H__