#ifndef __GS_GS_KERNELS__
#define __GS_GS_KERNELS__
/**
 * Device gather kernel combining duplicate entries with addition.
 *
 * Gathers values from the scattered array @p u into the gathered array
 * @p v.  Index arrays @p gd / @p dg hold 1-based (Fortran-convention)
 * indices, hence the ubiquitous `- 1`.
 *
 * Launch: 1-D grid of 1-D blocks, any size (grid-stride loops throughout).
 *
 * @param v   destination (gathered) array
 * @param m   total number of gather entries
 * @param o   tail offset; sign selects the tail treatment (see below)
 * @param dg  1-based destination indices into v
 * @param u   source (scattered) array
 * @param n   length of u (unused here; kept for a uniform kernel interface)
 * @param gd  1-based source indices into u
 * @param nb  number of blocks of shared dofs
 * @param b   per-block lengths
 * @param bo  per-block offsets into gd/dg
 *
 * NOTE(review): this definition was reconstructed from a garbled source.
 * The `k = bo[i]` offset read, the initial `tmp` load, the stores of `tmp`
 * and the `if (o < 0)` branch structure were not visible and were inferred
 * by symmetry with the sibling mul/min/max kernels — confirm upstream.
 */
template< typename T >
__global__ void gather_kernel_add(T * __restrict__ v,
                                  const int m,
                                  const int o,
                                  const int * __restrict__ dg,
                                  const T * __restrict__ u,
                                  const int n,
                                  const int * __restrict__ gd,
                                  const int nb,
                                  const int * __restrict__ b,
                                  const int * __restrict__ bo) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  /* Reduce each block of shared dofs; one thread owns one whole block. */
  for (int i = idx; i < nb; i += str) {
    const int blk_len = b[i];
    const int k = bo[i];
    T tmp = u[gd[k] - 1];
    for (int j = 1; j < blk_len; j++) {
      tmp += u[gd[k + j] - 1];
    }
    v[dg[k] - 1] = tmp;
  }

  if (o < 0) {
    /* Tail of unique (unshared) entries: straight copy. */
    for (int i = ((abs(o) - 1) + idx); i < m; i += str) {
      v[dg[i] - 1] = u[gd[i] - 1];
    }
  }
  else {
    /* Tail of entries shared pairwise: reduce adjacent pairs. */
    for (int i = ((o - 1) + idx); i < m; i += str) {
      T tmp = u[gd[i] - 1] + u[gd[i + 1] - 1];
      v[dg[i] - 1] = tmp;
    }
  }
}
/**
 * Device gather kernel combining duplicate entries with multiplication.
 *
 * Same structure as gather_kernel_add, with `*` as the reduction operator.
 * Index arrays @p gd / @p dg hold 1-based (Fortran-convention) indices.
 *
 * Launch: 1-D grid of 1-D blocks, any size (grid-stride loops throughout).
 *
 * @param v   destination (gathered) array
 * @param m   total number of gather entries
 * @param o   tail offset; sign selects the tail treatment
 * @param dg  1-based destination indices into v
 * @param u   source (scattered) array
 * @param n   length of u (unused here; kept for a uniform kernel interface)
 * @param gd  1-based source indices into u
 * @param nb  number of blocks of shared dofs
 * @param b   per-block lengths
 * @param bo  per-block offsets into gd/dg
 *
 * NOTE(review): reconstructed from a garbled source; the `k = bo[i]` read,
 * the stores of `tmp` and the `if (o < 0)` branching were inferred by
 * symmetry with the sibling kernels — confirm upstream.
 */
template< typename T >
__global__ void gather_kernel_mul(T * __restrict__ v,
                                  const int m,
                                  const int o,
                                  const int * __restrict__ dg,
                                  const T * __restrict__ u,
                                  const int n,
                                  const int * __restrict__ gd,
                                  const int nb,
                                  const int * __restrict__ b,
                                  const int * __restrict__ bo) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  /* Reduce each block of shared dofs; one thread owns one whole block. */
  for (int i = idx; i < nb; i += str) {
    const int blk_len = b[i];
    const int k = bo[i];
    T tmp = u[gd[k] - 1];
    for (int j = 1; j < blk_len; j++) {
      tmp *= u[gd[k + j] - 1];
    }
    v[dg[k] - 1] = tmp;
  }

  if (o < 0) {
    /* Tail of unique (unshared) entries: straight copy. */
    for (int i = ((abs(o) - 1) + idx); i < m; i += str) {
      v[dg[i] - 1] = u[gd[i] - 1];
    }
  }
  else {
    /* Tail of entries shared pairwise: reduce adjacent pairs. */
    for (int i = ((o - 1) + idx); i < m; i += str) {
      T tmp = u[gd[i] - 1] * u[gd[i + 1] - 1];
      v[dg[i] - 1] = tmp;
    }
  }
}
/**
 * Device gather kernel combining duplicate entries with `min`.
 *
 * Same structure as gather_kernel_add, with `min` as the reduction
 * operator.  Index arrays @p gd / @p dg hold 1-based indices.
 *
 * Launch: 1-D grid of 1-D blocks, any size (grid-stride loops throughout).
 *
 * @param v   destination (gathered) array
 * @param m   total number of gather entries
 * @param o   tail offset; sign selects the tail treatment
 * @param dg  1-based destination indices into v
 * @param u   source (scattered) array
 * @param n   length of u (unused here; kept for a uniform kernel interface)
 * @param gd  1-based source indices into u
 * @param nb  number of blocks of shared dofs
 * @param b   per-block lengths
 * @param bo  per-block offsets into gd/dg
 *
 * NOTE(review): reconstructed from a garbled source; the `k = bo[i]` read,
 * the stores of `tmp` and the `if (o < 0)` branching were inferred by
 * symmetry with the sibling kernels — confirm upstream.
 */
template< typename T >
__global__ void gather_kernel_min(T * __restrict__ v,
                                  const int m,
                                  const int o,
                                  const int * __restrict__ dg,
                                  const T * __restrict__ u,
                                  const int n,
                                  const int * __restrict__ gd,
                                  const int nb,
                                  const int * __restrict__ b,
                                  const int * __restrict__ bo) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  /* Reduce each block of shared dofs; one thread owns one whole block. */
  for (int i = idx; i < nb; i += str) {
    const int blk_len = b[i];
    const int k = bo[i];
    T tmp = u[gd[k] - 1];
    for (int j = 1; j < blk_len; j++) {
      tmp = min(u[gd[k + j] - 1], tmp);
    }
    v[dg[k] - 1] = tmp;
  }

  if (o < 0) {
    /* Tail of unique (unshared) entries: straight copy. */
    for (int i = ((abs(o) - 1) + idx); i < m; i += str) {
      v[dg[i] - 1] = u[gd[i] - 1];
    }
  }
  else {
    /* Tail of entries shared pairwise: reduce adjacent pairs. */
    for (int i = ((o - 1) + idx); i < m; i += str) {
      T tmp = min(u[gd[i] - 1], u[gd[i + 1] - 1]);
      v[dg[i] - 1] = tmp;
    }
  }
}
/**
 * Device gather kernel combining duplicate entries with `max`.
 *
 * Same structure as gather_kernel_add, with `max` as the reduction
 * operator.  Index arrays @p gd / @p dg hold 1-based indices.
 *
 * Launch: 1-D grid of 1-D blocks, any size (grid-stride loops throughout).
 *
 * @param v   destination (gathered) array
 * @param m   total number of gather entries
 * @param o   tail offset; sign selects the tail treatment
 * @param dg  1-based destination indices into v
 * @param u   source (scattered) array
 * @param n   length of u (unused here; kept for a uniform kernel interface)
 * @param gd  1-based source indices into u
 * @param nb  number of blocks of shared dofs
 * @param b   per-block lengths
 * @param bo  per-block offsets into gd/dg
 *
 * NOTE(review): reconstructed from a garbled source; the `k = bo[i]` read,
 * the stores of `tmp` and the `if (o < 0)` branching were inferred by
 * symmetry with the sibling kernels — confirm upstream.
 */
template< typename T >
__global__ void gather_kernel_max(T * __restrict__ v,
                                  const int m,
                                  const int o,
                                  const int * __restrict__ dg,
                                  const T * __restrict__ u,
                                  const int n,
                                  const int * __restrict__ gd,
                                  const int nb,
                                  const int * __restrict__ b,
                                  const int * __restrict__ bo) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  /* Reduce each block of shared dofs; one thread owns one whole block. */
  for (int i = idx; i < nb; i += str) {
    const int blk_len = b[i];
    const int k = bo[i];
    T tmp = u[gd[k] - 1];
    for (int j = 1; j < blk_len; j++) {
      tmp = max(u[gd[k + j] - 1], tmp);
    }
    v[dg[k] - 1] = tmp;
  }

  if (o < 0) {
    /* Tail of unique (unshared) entries: straight copy. */
    for (int i = ((abs(o) - 1) + idx); i < m; i += str) {
      v[dg[i] - 1] = u[gd[i] - 1];
    }
  }
  else {
    /* Tail of entries shared pairwise: reduce adjacent pairs. */
    for (int i = ((o - 1) + idx); i < m; i += str) {
      T tmp = max(u[gd[i] - 1], u[gd[i + 1] - 1]);
      v[dg[i] - 1] = tmp;
    }
  }
}
/**
 * Device scatter kernel.
 *
 * Scatters the gathered array @p v back out to the scattered array @p u:
 * every block of shared dofs receives a broadcast of its single gathered
 * value, and the remaining (facet) entries are copied one-to-one.  Index
 * arrays @p gd / @p dg hold 1-based (Fortran-convention) indices.
 *
 * Launch: 1-D grid of 1-D blocks, any size (grid-stride loops throughout).
 * Precondition: nb >= 1 — `bo[nb - 1]` and `b[nb - 1]` are read
 * unconditionally to compute the facet offset.
 *
 * @param v   source (gathered) array
 * @param m   total number of scatter entries
 * @param dg  1-based source indices into v
 * @param u   destination (scattered) array
 * @param n   length of u (unused here; kept for a uniform kernel interface)
 * @param gd  1-based destination indices into u
 * @param nb  number of blocks of shared dofs
 * @param b   per-block lengths
 * @param bo  per-block offsets into gd/dg
 *
 * NOTE(review): reconstructed from a garbled source; the `k = bo[i]` read
 * was not visible and was inferred from the `k + j` indexing — confirm
 * upstream.
 */
template< typename T >
__global__ void scatter_kernel(T * __restrict__ v,
                               const int m,
                               const int * __restrict__ dg,
                               T * __restrict__ u,
                               const int n,
                               const int * __restrict__ gd,
                               const int nb,
                               const int * __restrict__ b,
                               const int * __restrict__ bo) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  /* Broadcast each block's gathered value to all its scattered entries. */
  for (int i = idx; i < nb; i += str) {
    const int blk_len = b[i];
    const int k = bo[i];
    T tmp = v[dg[k] - 1];
    for (int j = 0; j < blk_len; j++) {
      u[gd[k + j] - 1] = tmp;
    }
  }

  /* Entries past the last block (facet data): one-to-one copy. */
  const int facet_offset = bo[nb - 1] + b[nb - 1];

  for (int i = ((facet_offset - 1) + idx); i < m; i += str) {
    u[gd[i] - 1] = v[dg[i] - 1];
  }
}
/**
 * Device gather-scatter packing kernel.
 *
 * Packs the values of @p u selected by the 1-based index map @p dof into
 * the contiguous communication buffer @p buf.
 *
 * Launch: 1-D grid of 1-D blocks covering at least n threads; each thread
 * handles one buffer entry (bounds-guarded, no grid-stride loop).
 *
 * @param u    source array
 * @param buf  destination communication buffer (length n)
 * @param dof  1-based indices into u (length n)
 * @param n    number of entries to pack
 *
 * NOTE(review): reconstructed from a garbled source — only the thread-index
 * computation was visible; the bounds guard and the copy statement were
 * inferred from the matching unpack kernel — confirm upstream.
 */
template< typename T >
__global__ void gs_pack_kernel(const T * __restrict__ u,
                               T * __restrict__ buf,
                               const int32_t * __restrict__ dof,
                               const int n) {
  const int j = threadIdx.x + blockDim.x * blockIdx.x;
  if (j >= n) return;

  buf[j] = u[dof[j] - 1];
}
/**
 * Device gather-scatter unpacking kernel (additive).
 *
 * Unpacks the communication buffer @p buf into @p u using the index map
 * @p dof.  A negative dof entry marks a destination that may be written by
 * several buffer entries concurrently and is therefore updated with
 * atomicAdd; the magnitude is the 1-based index in both cases.
 *
 * Launch: 1-D grid of 1-D blocks covering at least n threads; each thread
 * handles one buffer entry (bounds-guarded, no grid-stride loop).
 * Requires SM60+ for atomicAdd on double-precision T.
 *
 * @param u    destination array, updated in place
 * @param buf  source communication buffer (length n)
 * @param dof  signed 1-based indices into u (length n); negative => atomic
 * @param n    number of entries to unpack
 *
 * NOTE(review): reconstructed from a garbled source — the bounds guard,
 * the positive-index branch and the pre-SM60 fallback were not visible and
 * were inferred; confirm both against upstream before relying on them.
 */
template< typename T >
__global__ void gs_unpack_add_kernel(T * __restrict__ u,
                                     const T * __restrict__ buf,
                                     const int32_t * __restrict__ dof,
                                     const int n) {
  const int j = threadIdx.x + blockDim.x * blockIdx.x;
  if (j >= n) return;

  const int32_t idx = dof[j];
  const T val = buf[j];
  if (idx < 0) {
#if __CUDA_ARCH__ >= 600
    atomicAdd(&u[-idx - 1], val);
#else
    /* NOTE(review): pre-SM60 fallback reconstructed; atomicAdd on double
       is unavailable here, so this non-atomic update assumes no concurrent
       writers on this path — verify against upstream. */
    u[-idx - 1] += val;
#endif
  } else {
    u[idx - 1] += val;
  }
}

#endif // __GS_GS_KERNELS__