35#ifndef __GS_GS_KERNELS__
36#define __GS_GS_KERNELS__
57 for (
int i = idx;
i <
nb;
i +=
str) {
62 tmp +=
u[
gd[
k +
j] - 1];
68 for (
int i = ((
abs(
o) - 1) + idx);
i < m ;
i +=
str) {
74 for (
int i = ((
o - 1) + idx);
i < m ;
i +=
str) {
102 for (
int i = idx;
i <
nb;
i +=
str) {
105 T tmp =
u[
gd[
k] - 1];
107 tmp *=
u[
gd[
k +
j] - 1];
113 for (
int i = ((
abs(
o) - 1) + idx);
i < m ;
i +=
str) {
119 for (
int i = ((
o - 1) + idx);
i < m ;
i +=
str) {
132template<
typename T >
147 for (
int i = idx;
i <
nb;
i +=
str) {
150 T tmp =
u[
gd[
k] - 1];
158 for (
int i = ((
abs(
o) - 1) + idx);
i < m ;
i +=
str) {
164 for (
int i = ((
o - 1) + idx);
i < m ;
i +=
str) {
177template<
typename T >
192 for (
int i = idx;
i <
nb;
i +=
str) {
195 T tmp =
u[
gd[
k] - 1];
203 for (
int i = ((
abs(
o) - 1) + idx);
i < m ;
i +=
str) {
209 for (
int i = ((
o - 1) + idx);
i < m ;
i +=
str) {
222template<
typename T >
236 for (
int i = idx;
i <
nb;
i +=
str) {
239 T tmp =
v[
dg[
k] - 1];
241 u[
gd[
k +
j] - 1] = tmp;
253template<
typename T >
268template<
typename T >
282#if __CUDA_ARCH__ >= 600
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
__global__ void gather_kernel_min(T *__restrict__ v, const int m, const int o, const int *__restrict__ dg, const T *__restrict__ u, const int n, const int *__restrict__ gd, const int nb, const int *__restrict__ b, const int *__restrict__ bo)
__global__ void scatter_kernel(T *__restrict__ v, const int m, const int *__restrict__ dg, T *__restrict__ u, const int n, const int *__restrict__ gd, const int nb, const int *__restrict__ b, const int *__restrict__ bo)
__global__ void gather_kernel_mul(T *__restrict__ v, const int m, const int o, const int *__restrict__ dg, const T *__restrict__ u, const int n, const int *__restrict__ gd, const int nb, const int *__restrict__ b, const int *__restrict__ bo)
__global__ void gs_pack_kernel(const T *__restrict__ u, T *__restrict__ buf, const int32_t *__restrict__ dof, const int n)
__global__ void gather_kernel_add(T *__restrict__ v, const int m, const int o, const int *__restrict__ dg, const T *__restrict__ u, const int n, const int *__restrict__ gd, const int nb, const int *__restrict__ b, const int *__restrict__ bo)
__global__ void gs_unpack_add_kernel(T *__restrict__ u, const T *__restrict__ buf, const int32_t *__restrict__ dof, const int n)
__global__ void gather_kernel_max(T *__restrict__ v, const int m, const int o, const int *__restrict__ dg, const T *__restrict__ u, const int n, const int *__restrict__ gd, const int nb, const int *__restrict__ b, const int *__restrict__ bo)