1#ifndef __MATH_MATH_KERNEL_H__
2#define __MATH_MATH_KERNEL_H__
48 for (
int i = idx;
i < n;
i +=
str) {
66 for (
int i = idx;
i < m;
i +=
str) {
67 a[mask[
i+1]-1] = b[mask[
i+1]-1];
84 for (
int i = idx;
i < m;
i +=
str) {
85 a[
i] = b[mask[
i+1]-1];
108template<
typename T >
117 for (
int i = idx;
i < n;
i +=
str) {
125template<
typename T >
133 for (
int i = idx;
i < n;
i +=
str) {
141template<
typename T >
150 for (
int i = idx;
i < n;
i +=
str) {
158template<
typename T >
166 for (
int i = idx;
i < n;
i +=
str) {
174template<
typename T >
182 for (
int i = idx;
i < n;
i +=
str) {
190template<
typename T >
199 for (
int i = idx;
i < n;
i +=
str) {
207template<
typename T >
217 for (
int i = idx;
i < n;
i +=
str) {
218 a[
i] = b[
i] + c[
i] + d[
i];
225template<
typename T >
234 for (
int i = idx;
i < n;
i +=
str) {
242template<
typename T >
253 for (
int i = idx;
i < n;
i+=
str) {
256 tmp += p[
j][
i]*alpha[
j];
265template<
typename T >
274 for (
int i = idx;
i < n;
i +=
str) {
282template<
typename T >
291 for (
int i = idx;
i < n;
i +=
str) {
299template<
typename T >
310 for (
int i = idx;
i < n;
i +=
str) {
318template<
typename T >
326 for (
int i = idx;
i < n;
i +=
str) {
334template<
typename T >
342 for (
int i = idx;
i < n;
i +=
str) {
350template<
typename T >
358 for (
int i = idx;
i < n;
i +=
str) {
366template<
typename T >
375 for (
int i = idx;
i < n;
i +=
str) {
383template<
typename T >
392 for (
int i = idx;
i < n;
i +=
str) {
393 a[
i] =
a[
i] - b[
i] * c[
i];
400template<
typename T >
408 for (
int i = idx;
i < n;
i +=
str) {
416template<
typename T >
425 for (
int i = idx;
i < n;
i +=
str) {
433template<
typename T >
442 for (
int i = idx;
i < n;
i +=
str) {
443 a[
i] =
a[
i] + b[
i] * c[
i];
451template<
typename T >
461 for (
int i = idx;
i < n;
i +=
str) {
462 a[
i] =
a[
i] + b[
i] * c[
i] * d[
i];
470template<
typename T >
483 for (
int i = idx;
i < n;
i +=
str) {
492template<
typename T >
507 for (
int i = idx;
i < n;
i +=
str) {
510 u3[
i] = v1[
i]*w2[
i] - v2[
i]*w1[
i];
531template<
typename T >
537 for (
int i = idx;
i<n ;
i +=
str)
563template<
typename T >
574 for (
int i=idx ;
i<n ;
i+=step)
597template<
typename T >
612 for (
int i = idx;
i < n;
i+=
str) {
632template<
typename T >
647 for (
int i = idx;
i < n;
i+=
str) {
674template<
typename T >
688 for (
int i = idx;
i < n;
i+=
str) {
708template<
typename T >
721 for (
int i = idx;
i<n ;
i +=
str)
743template<
typename T >
750 for (
int i = idx;
i < n;
i +=
str) {
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void addcol4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void reduce_kernel(T *bufred, const int n)
__global__ void invcol2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void add2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__inline__ __device__ T reduce_warp(T val)
__global__ void masked_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
__global__ void glsc3_many_kernel(const T *a, const T **b, const T *c, T *buf_h, const int j, const int n)
__global__ void cfill_mask_kernel(T *__restrict__ a, const T c, const int size, int *__restrict__ mask, const int mask_size)
__global__ void glsc3_reduce_kernel(T *bufred, const int n, const int j)
__global__ void add3s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T c1, const T c2, const int n)
__global__ void add2s1_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void add2s2_many_kernel(T *__restrict__ x, const T **p, const T *alpha, const int p_cur, const int n)
__global__ void cmult_kernel(T *__restrict__ a, const T c, const int n)
__global__ void addcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void col2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void col3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void sub2_kernel(T *__restrict__ a, const T *__restrict__ b, const int n)
__global__ void glsc2_kernel(const T *a, const T *b, T *buf_h, const int n)
__global__ void cmult2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void sub3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void glsum_kernel(const T *a, T *buf_h, const int n)
__global__ void glsc3_kernel(const T *a, const T *b, const T *c, T *buf_h, const int n)
__global__ void add2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void vdot3_kernel(T *__restrict__ dot, const T *__restrict__ u1, const T *__restrict__ u2, const T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const int n)
__global__ void invcol1_kernel(T *__restrict__ a, const int n)
__global__ void add3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void add4_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const T *__restrict__ d, const int n)
__global__ void cfill_kernel(T *__restrict__ a, const T c, const int n)
__global__ void masked_red_copy_kernel(T *__restrict__ a, T *__restrict__ b, int *__restrict__ mask, const int n, const int m)
__global__ void vcross_kernel(T *__restrict__ u1, T *__restrict__ u2, T *__restrict__ u3, const T *__restrict__ v1, const T *__restrict__ v2, const T *__restrict__ v3, const T *__restrict__ w1, const T *__restrict__ w2, const T *__restrict__ w3, const int n)
__global__ void addsqr2s2_kernel(T *__restrict__ a, const T *__restrict__ b, const T c1, const int n)
__global__ void cadd2_kernel(T *__restrict__ a, T *__restrict__ b, const T c, const int n)
__global__ void absval_kernel(T *__restrict__ a, const int n)
__global__ void subcol3_kernel(T *__restrict__ a, const T *__restrict__ b, const T *__restrict__ c, const int n)
__global__ void cadd_kernel(T *__restrict__ a, const T c, const int n)