1#ifndef __MATH_MATH_KERNEL_H__
2#define __MATH_MATH_KERNEL_H__
48 for (
int i = idx;
i < n;
i +=
str) {
104 for (
int i = idx;
i < m;
i +=
str) {
105#if __CUDA_ARCH__ >= 600
114template<
typename T >
137 const int mask_size) {
142 for (
int i = idx;
i < mask_size;
i +=
str) {
a[
mask[
i]] = c; }
148template<
typename T >
157 for (
int i = idx;
i < n;
i +=
str) {
165template<
typename T >
173 for (
int i = idx;
i < n;
i +=
str) {
181template<
typename T >
190 for (
int i = idx;
i < n;
i +=
str) {
198template<
typename T >
206 for (
int i = idx;
i < n;
i +=
str) {
214template<
typename T >
223 for (
int i = idx;
i < n;
i +=
str) {
231template<
typename T >
239 for (
int i = idx;
i < n;
i +=
str) {
247template<
typename T >
255 for (
int i = idx;
i < n;
i +=
str) {
263template<
typename T >
272 for (
int i = idx;
i < n;
i +=
str) {
280template<
typename T >
290 for (
int i = idx;
i < n;
i +=
str) {
291 a[
i] = b[
i] + c[
i] + d[
i];
298template<
typename T >
307 for (
int i = idx;
i < n;
i +=
str) {
315template<
typename T >
326 for (
int i = idx;
i < n;
i+=
str) {
329 tmp += p[
j][
i]*alpha[
j];
338template<
typename T >
347 for (
int i = idx;
i < n;
i +=
str) {
355template<
typename T >
364 for (
int i = idx;
i < n;
i +=
str) {
372template<
typename T >
383 for (
int i = idx;
i < n;
i +=
str) {
391template<
typename T >
404 for (
int i = idx;
i < n;
i +=
str) {
412template<
typename T >
427 for (
int i = idx;
i < n;
i +=
str) {
435template<
typename T >
443 for (
int i = idx;
i < n;
i +=
str) {
451template<
typename T >
459 for (
int i = idx;
i < n;
i +=
str) {
467template<
typename T >
476 for (
int i = idx;
i < n;
i +=
str) {
484template<
typename T >
492 for (
int i = idx;
i < n;
i +=
str) {
500template<
typename T >
509 for (
int i = idx;
i < n;
i +=
str) {
517template<
typename T >
526 for (
int i = idx;
i < n;
i +=
str) {
527 a[
i] =
a[
i] - b[
i] * c[
i];
534template<
typename T >
542 for (
int i = idx;
i < n;
i +=
str) {
550template<
typename T >
559 for (
int i = idx;
i < n;
i +=
str) {
567template<
typename T >
576 for (
int i = idx;
i < n;
i +=
str) {
577 a[
i] =
a[
i] + b[
i] * c[
i];
585template<
typename T >
595 for (
int i = idx;
i < n;
i +=
str) {
596 a[
i] =
a[
i] + b[
i] * c[
i] * d[
i];
604template<
typename T >
614 for (
int i = idx;
i < n;
i +=
str) {
615 a[
i] =
a[
i] + s * b[
i] * c[
i];
623template<
typename T >
636 for (
int i = idx;
i < n;
i +=
str) {
645template<
typename T >
660 for (
int i = idx;
i < n;
i +=
str) {
663 u3[
i] = v1[
i]*w2[
i] - v2[
i]*w1[
i];
685template<
typename T >
691 for (
int i = idx;
i<n ;
i +=
str)
718template<
typename T >
729 for (
int i=idx ;
i<n ;
i+=step)
753template<
typename T >
768 for (
int i = idx;
i < n;
i+=
str) {
788template<
typename T >
803 for (
int i = idx;
i < n;
i+=
str) {
829template<
typename T >
843 for (
int i = idx;
i < n;
i+=
str) {
864template<
typename T >
878 for (
int i = idx;
i < n;
i+=
str) {
899template<
typename T >
912 for (
int i = idx;
i<n ;
i +=
str)
934template<
typename T >
941 for (
int i = idx;
i < n;
i +=
str) {
1002 for (
int i = idx;
i < n;
i +=
str)
a[
i] =
max(b[
i], c);
1009template <
typename T>
1023template <
typename T>
1038template <
typename T>
1051template <
typename T>
1058 for (
int i = idx;
i < n;
i +=
str)
a[
i] =
min(b[
i], c);
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void addcol4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void pwmin_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void reduce_kernel(T *bufred, const int n)
__global__ void cdiv2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void invcol2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add4s3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T c1, const T c2, const T c3, const int n)
__inline__ __device__ T reduce_warp(T val)
__global__ void masked_atomic_reduction_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
__global__ void pwmax_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h, const int j, const int n)
__global__ void addcol3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T s, const int n)
__global__ void cfill_mask_kernel(T *__restrict__ a, const T c, const int size, int *__restrict__ mask, const int mask_size)
__global__ void cdiv_kernel(T *__restrict__ a, const T c, const int n)
__global__ void glsc3_reduce_kernel(T *bufred, const int n, const int j)
__global__ void pwmax_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void glsubnorm2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void pwmin_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T c1, const T c2, const int n)
__global__ void add2s1_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void add5s4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T *__restrict__ e, const T c1, const T c2, const T c3, const T c4, const int n)
__global__ void masked_gather_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void add2s2_many_kernel(T *__restrict__ x, const T **p, const T *alpha, const int p_cur, const int n)
__global__ void pwmax_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void cmult_kernel(T *__restrict__ a, const T c, const int n)
__global__ void addcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void pwmin_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void pwmax_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void col2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void col3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void sub2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void cmult2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void pwmin_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void sub3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsum_kernel(const T *a, T *buf_h, const int n)
__global__ void masked_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
__global__ void add2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void vdot3_kernel(T *__restrict__ dot, const T *__restrict__ u1, const T *__restrict__ u2, const T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const int n)
__global__ void invcol1_kernel(T *__restrict__ a, const int n)
__global__ void add3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void add4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cfill_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_scatter_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void vcross_kernel(T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const T *__restrict__ w1, const T *__restrict__ w2, const T *__restrict__ w3, const int n)
__global__ void addsqr2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void cadd2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void absval_kernel(T *__restrict__ a, const int n)
__global__ void invcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void subcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void cadd_kernel(T *__restrict__ a, const T c, const int n)
Object for handling masks in Neko.