1#ifndef __MATH_MATH_KERNEL_H__ 
    2#define __MATH_MATH_KERNEL_H__ 
   48  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  111template< 
typename T >
 
  129template< 
typename T >
 
  145template< 
typename T >
 
  154  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  162template< 
typename T >
 
  170  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  178template< 
typename T >
 
  187  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  195template< 
typename T >
 
  203  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  211template< 
typename T >
 
  220  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  228template< 
typename T >
 
  236  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  244template< 
typename T >
 
  252  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  260template< 
typename T >
 
  269  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  277template< 
typename T >
 
  287  for (
int i = idx; 
i < n; 
i += 
str) {
 
  288    a[
i] = b[
i] + c[
i] + d[
i];
 
 
  295template< 
typename T >
 
  304  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  312template< 
typename T >
 
  323  for (
int i = idx; 
i < n; 
i+= 
str) {
 
  326      tmp += p[
j][
i]*alpha[
j];
 
 
  335template< 
typename T >
 
  344  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  352template< 
typename T >
 
  361  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  369template< 
typename T >
 
  380  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  388template< 
typename T >
 
  401  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  409template< 
typename T >
 
  424  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  432template< 
typename T >
 
  440  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  448template< 
typename T >
 
  456  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  464template< 
typename T >
 
  473  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  481template< 
typename T >
 
  489  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  497template< 
typename T >
 
  506  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  514template< 
typename T >
 
  523  for (
int i = idx; 
i < n; 
i += 
str) {
 
  524    a[
i] = 
a[
i] - b[
i] * c[
i];
 
 
  531template< 
typename T >
 
  539  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  547template< 
typename T >
 
  556  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  564template< 
typename T >
 
  573  for (
int i = idx; 
i < n; 
i += 
str) {
 
  574    a[
i] = 
a[
i] + b[
i] * c[
i];
 
 
  582template< 
typename T >
 
  592  for (
int i = idx; 
i < n; 
i += 
str) {
 
  593    a[
i] = 
a[
i] + b[
i] * c[
i] * d[
i];
 
 
  601template< 
typename T >
 
  611  for (
int i = idx; 
i < n; 
i += 
str) {
 
  612    a[
i] = 
a[
i] + s * b[
i] * c[
i];
 
 
  620template< 
typename T >
 
  633  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
  642template< 
typename T >
 
  657  for (
int i = idx; 
i < n; 
i += 
str) {
 
  660    u3[
i] = v1[
i]*w2[
i] - v2[
i]*w1[
i];
 
 
  681template< 
typename T >
 
  687  for (
int i = idx; 
i<n ; 
i += 
str)
 
 
  713template< 
typename T >
 
  724   for (
int i=idx ; 
i<n ; 
i+=step)
 
 
  747template< 
typename T >
 
  762  for (
int i = idx; 
i < n; 
i+= 
str) {
 
 
  782template< 
typename T >
 
  797    for (
int i = idx; 
i < n; 
i+= 
str) {
 
 
  823template< 
typename T >
 
  837  for (
int i = idx; 
i < n; 
i+= 
str) {
 
 
  858template< 
typename T >
 
  872  for (
int i = idx; 
i < n; 
i+= 
str) {
 
 
  893template< 
typename T >
 
  906  for (
int i = idx; 
i<n ; 
i += 
str)
 
 
  928template< 
typename T >
 
  935  for (
int i = idx; 
i < n; 
i += 
str) {
 
 
 1003template <
typename T>
 
 1017template <
typename T>
 
 1032template <
typename T>
 
 1045template <
typename T>
 
 1052    for (
int i = idx; 
i < n; 
i += 
str) 
a[
i] = 
min(b[
i], c);
 
 
__global__ void const T *__restrict__ x
 
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
 
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
 
__global__ void addcol4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
 
__global__ void pwmin_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
 
__global__ void reduce_kernel(T *bufred, const int n)
 
__global__ void cdiv2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
 
__global__ void invcol2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
 
__global__ void add2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
 
__global__ void add4s3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T c1, const T c2, const T c3, const int n)
 
__inline__ __device__ T reduce_warp(T val)
 
__global__ void masked_atomic_reduction_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
 
__global__ void pwmax_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
 
__global__ void glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h, const int j, const int n)
 
__global__ void addcol3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T s, const int n)
 
__global__ void cfill_mask_kernel(T *__restrict__ a, const T c, const int size, int *__restrict__ mask, const int mask_size)
 
__global__ void cdiv_kernel(T *__restrict__ a, const T c, const int n)
 
__global__ void glsc3_reduce_kernel(T *bufred, const int n, const int j)
 
__global__ void pwmax_sca2_kernel(T *__restrict__ a, const T c, const int n)
 
__global__ void glsubnorm2_kernel(const T *a, const T *b, T *buf_h, const int n)
 
__global__ void pwmin_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
 
__global__ void add3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T c1, const T c2, const int n)
 
__global__ void add2s1_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
 
__global__ void add5s4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T *__restrict__ e, const T c1, const T c2, const T c3, const T c4, const int n)
 
__global__ void masked_gather_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
 
__global__ void add2s2_many_kernel(T *__restrict__ x, const T **p, const T *alpha, const int p_cur, const int n)
 
__global__ void pwmax_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
 
__global__ void cmult_kernel(T *__restrict__ a, const T c, const int n)
 
__global__ void addcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
 
__global__ void pwmin_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
 
__global__ void pwmax_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
 
__global__ void col2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
 
__global__ void col3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
 
__global__ void sub2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
 
__global__ void glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
 
__global__ void cmult2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
 
__global__ void pwmin_sca2_kernel(T *__restrict__ a, const T c, const int n)
 
__global__ void sub3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
 
__global__ void glsum_kernel(const T *a, T *buf_h, const int n)
 
__global__ void masked_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
 
__global__ void glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
 
__global__ void add2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
 
__global__ void vdot3_kernel(T *__restrict__ dot, const T *__restrict__ u1, const T *__restrict__ u2, const T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const int n)
 
__global__ void invcol1_kernel(T *__restrict__ a, const int n)
 
__global__ void add3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
 
__global__ void add4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
 
__global__ void cfill_kernel(T *__restrict__ a, const T c, const int n)
 
__global__ void masked_scatter_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
 
__global__ void vcross_kernel(T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const T *__restrict__ w1, const T *__restrict__ w2, const T *__restrict__ w3, const int n)
 
__global__ void addsqr2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
 
__global__ void cadd2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
 
__global__ void absval_kernel(T *__restrict__ a, const int n)
 
__global__ void invcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
 
__global__ void subcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
 
__global__ void cadd_kernel(T *__restrict__ a, const T c, const int n)
 
Object for handling masks in Neko.