1#ifndef __MATH_MATH_KERNEL_H__
2#define __MATH_MATH_KERNEL_H__
48 for (
int i = idx;
i < n;
i +=
str) {
92 const int ly,
const int lz) {
93 const int idx2 = idx - 1;
94 index[3] =
idx2 / (lx * ly * lz);
95 index[2] = (
idx2 - (lx * ly * lz) * index[3]) / (lx * ly);
96 index[1] = (
idx2 - (lx * ly * lz) * index[3] - (lx * ly) * index[2]) / lx;
97 index[0] = (
idx2 - (lx * ly * lz) * index[3] - (lx * ly) * index[2]) -
107 const int n1,
const int n2,
const int nf) {
108 return ((
i) + (n1) * (((
j) - 1) + (n2) * (((
k) - 1) + (
nf) * (((l) - 1))))) - 1;
114template<
typename T >
130 for (
int m = idx; m <
n_mask; m +=
str) {
131 const int f = facet[m + 1];
155template<
typename T >
173template<
typename T >
192template<
typename T >
202 for (
int i = idx;
i < m;
i +=
str) {
203#if __CUDA_ARCH__ >= 600
212template<
typename T >
230template<
typename T >
253 const int mask_size) {
258 for (
int i = idx;
i < mask_size;
i +=
str) {
a[
mask[
i]] = c; }
264template<
typename T >
273 for (
int i = idx;
i < n;
i +=
str) {
281template<
typename T >
289 for (
int i = idx;
i < n;
i +=
str) {
297template<
typename T >
306 for (
int i = idx;
i < n;
i +=
str) {
314template<
typename T >
322 for (
int i = idx;
i < n;
i +=
str) {
330template<
typename T >
339 for (
int i = idx;
i < n;
i +=
str) {
347template<
typename T >
357 for (
int i = idx;
i < n;
i +=
str) {
365template<
typename T >
373 for (
int i = idx;
i < n;
i +=
str) {
381template<
typename T >
389 for (
int i = idx;
i < n;
i +=
str) {
397template<
typename T >
406 for (
int i = idx;
i < n;
i +=
str) {
414template<
typename T >
424 for (
int i = idx;
i < n;
i +=
str) {
425 a[
i] = b[
i] + c[
i] + d[
i];
432template<
typename T >
441 for (
int i = idx;
i < n;
i +=
str) {
449template<
typename T >
460 for (
int i = idx;
i < n;
i+=
str) {
463 tmp += p[
j][
i]*alpha[
j];
472template<
typename T >
481 for (
int i = idx;
i < n;
i +=
str) {
489template<
typename T >
498 for (
int i = idx;
i < n;
i +=
str) {
506template<
typename T >
517 for (
int i = idx;
i < n;
i +=
str) {
525template<
typename T >
538 for (
int i = idx;
i < n;
i +=
str) {
546template<
typename T >
561 for (
int i = idx;
i < n;
i +=
str) {
569template<
typename T >
577 for (
int i = idx;
i < n;
i +=
str) {
585template<
typename T >
593 for (
int i = idx;
i < n;
i +=
str) {
601template<
typename T >
610 for (
int i = idx;
i < n;
i +=
str) {
618template<
typename T >
626 for (
int i = idx;
i < n;
i +=
str) {
634template<
typename T >
643 for (
int i = idx;
i < n;
i +=
str) {
651template<
typename T >
660 for (
int i = idx;
i < n;
i +=
str) {
661 a[
i] =
a[
i] - b[
i] * c[
i];
668template<
typename T >
676 for (
int i = idx;
i < n;
i +=
str) {
684template<
typename T >
693 for (
int i = idx;
i < n;
i +=
str) {
701template<
typename T >
710 for (
int i = idx;
i < n;
i +=
str) {
711 a[
i] =
a[
i] + b[
i] * c[
i];
719template<
typename T >
729 for (
int i = idx;
i < n;
i +=
str) {
730 a[
i] =
a[
i] + b[
i] * c[
i] * d[
i];
738template<
typename T >
748 for (
int i = idx;
i < n;
i +=
str) {
749 a[
i] =
a[
i] + s * b[
i] * c[
i];
757template<
typename T >
770 for (
int i = idx;
i < n;
i +=
str) {
779template<
typename T >
794 for (
int i = idx;
i < n;
i +=
str) {
797 u3[
i] = v1[
i]*w2[
i] - v2[
i]*w1[
i];
845template<
typename T >
851 for (
int i = idx;
i<n ;
i +=
str)
876template<
typename T >
882 for (
int i = idx;
i<n ;
i +=
str)
907template<
typename T >
913 for (
int i = idx;
i<n ;
i +=
str)
939template<
typename T >
950 for (
int i=idx ;
i<n ;
i+=step)
974template<
typename T >
989 for (
int i = idx;
i < n;
i+=
str) {
1009template<
typename T >
1024 for (
int i = idx;
i < n;
i+=
str) {
1050template<
typename T >
1064 for (
int i = idx;
i < n;
i+=
str) {
1085template<
typename T >
1099 for (
int i = idx;
i < n;
i+=
str) {
1120template<
typename T >
1133 for (
int i = idx;
i<n ;
i +=
str)
1155template<
typename T >
1169 for (
int i = idx;
i<n ;
i +=
str)
1191template<
typename T >
1205 for (
int i = idx;
i<n ;
i +=
str)
1227template<
typename T >
1234 for (
int i = idx;
i < n;
i +=
str) {
1246template <
typename T>
1260template <
typename T>
1275template <
typename T>
1288template <
typename T>
1295 for (
int i = idx;
i < n;
i +=
str)
a[
i] =
max(b[
i], c);
1302template <
typename T>
1316template <
typename T>
1331template <
typename T>
1344template <
typename T>
1351 for (
int i = idx;
i < n;
i +=
str)
a[
i] =
min(b[
i], c);
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void addcol4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cwrap_kernel(T *__restrict__ a, const T min_val, const T max_val, const int n)
__global__ void pwmin_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void reduce_kernel(T *bufred, const int n)
__global__ void cdiv2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void invcol2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add4s3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T c1, const T c2, const T c3, const int n)
__inline__ __device__ T reduce_warp(T val)
__global__ void masked_scatter_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void masked_atomic_reduction_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
__global__ void pwmax_vec3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h, const int j, const int n)
__global__ void addcol3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T s, const int n)
__global__ void cfill_mask_kernel(T *__restrict__ a, const T c, const int size, int *__restrict__ mask, const int mask_size)
__global__ void cdiv_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_copy_kernel_aligned(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void face_masked_gather_copy_kernel(T *__restrict__ a, const T *__restrict__ b, const int *__restrict__ mask, const int *__restrict__ facet, const int n1, const int n2, const int lx, const int ly, const int lz, const int n_mask)
__global__ void glsc3_reduce_kernel(T *bufred, const int n, const int j)
__global__ void masked_gather_copy_aligned_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void pwmax_sca2_kernel(T *__restrict__ a, const T c, const int n)
__device__ __forceinline__ void face_gather_nonlinear_index(int *index, const int idx, const int lx, const int ly, const int lz)
__global__ void reduce_max_kernel(T *bufred, const T ninf, const int n)
__global__ void glsubnorm2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void pwmin_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T c1, const T c2, const int n)
__inline__ __device__ T reduce_max_warp(T val)
__global__ void add2s1_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__device__ __forceinline__ int face_gather_idx(const int i, const int j, const int k, const int l, const int n1, const int n2, const int nf)
__global__ void add5s4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const T *__restrict__ e, const T c1, const T c2, const T c3, const T c4, const int n)
__global__ void masked_gather_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void add2s2_many_kernel(T *__restrict__ x, const T **p, const T *alpha, const int p_cur, const int n)
__global__ void pwmax_vec2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void cmult_kernel(T *__restrict__ a, const T c, const int n)
__global__ void addcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void pwmin_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void pwmax_sca3_kernel(T *__restrict__ a, const T *__restrict b, const T c, const int n)
__global__ void col2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void masked_copy_kernel_0(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void col3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void sub2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void glmin_kernel(const T *a, const T pinf, T *buf_h, const int n)
__global__ void glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void cmult2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void pwmin_sca2_kernel(T *__restrict__ a, const T c, const int n)
__global__ void sub3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsum_kernel(const T *a, T *buf_h, const int n)
__global__ void glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
__global__ void add2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void vdot3_kernel(T *__restrict__ dot, const T *__restrict__ u1, const T *__restrict__ u2, const T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const int n)
__global__ void invcol1_kernel(T *__restrict__ a, const int n)
__global__ void add3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glmax_kernel(const T *a, const T ninf, T *buf_h, const int n)
__global__ void add4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cfill_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_scatter_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int n_mask)
__global__ void reduce_min_kernel(T *bufred, const T pinf, const int n)
__global__ void vcross_kernel(T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const T *__restrict__ w1, const T *__restrict__ w2, const T *__restrict__ w3, const int n)
__global__ void addsqr2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void cadd2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void absval_kernel(T *__restrict__ a, const int n)
__global__ void invcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void subcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void cadd_kernel(T *__restrict__ a, const T c, const int n)
__inline__ __device__ T reduce_min_warp(T val)
Object for handling masks in Neko.