49#include "math_kernel.cl.h"
56 b,
a, 0, 0, (*n) *
sizeof(
real),
80 const int nb = ((*m) + 256 - 1) / 256;
110 const int nb = ((*m) + 256 - 1) / 256;
141 const int nb = ((*n) + 256 - 1) / 256;
163 "masked_gather_copy_aligned_kernel", &
err);
172 const int nb = ((*n) + 256 - 1) / 256;
187 int *n1,
int *n2,
int *lx,
int *ly,
196 "face_masked_gather_copy_kernel", &
err);
210 const int nb = ((*m) + 256 - 1) / 256;
241 const int nb = ((*n) + 256 - 1) / 256;
272 const int nb = ((*n) + 256 - 1) / 256;
302 const int nb = ((*mask_size) + 256 - 1) / 256;
352 const int nb = ((*n) + 256 - 1) / 256;
380 const int nb = ((*n) + 256 - 1) / 256;
406 const int nb = ((*n) + 256 - 1) / 256;
434 const int nb = ((*n) + 256 - 1) / 256;
460 const int nb = ((*n) + 256 - 1) / 256;
488 const int nb = ((*n) + 256 - 1) / 256;
516 const int nb = ((*n) + 256 - 1) / 256;
542 const int nb = ((*n) + 256 - 1) / 256;
569 const int nb = ((*n) + 256 - 1) / 256;
598 const int nb = ((*n) + 256 - 1) / 256;
628 const int nb = ((*n) + 256 - 1) / 256;
658 const int nb = ((*n) + 256 - 1) / 256;
688 const int nb = ((*n) + 256 - 1) / 256;
720 const int nb = ((*n) + 256 - 1) / 256;
751 const int nb = ((*n) + 256 - 1) / 256;
782 const int nb = ((*n) + 256 - 1) / 256;
815 const int nb = ((*n) + 256 - 1) / 256;
851 const int nb = ((*n) + 256 - 1) / 256;
877 const int nb = ((*n) + 256 - 1) / 256;
904 const int nb = ((*n) + 256 - 1) / 256;
931 const int nb = ((*n) + 256 - 1) / 256;
960 const int nb = ((*n) + 256 - 1) / 256;
989 const int nb = ((*n) + 256 - 1) / 256;
1016 const int nb = ((*n) + 256 - 1) / 256;
1045 const int nb = ((*n) + 256 - 1) / 256;
1074 const int nb = ((*n) + 256 - 1) / 256;
1104 const int nb = ((*n) + 256 - 1) / 256;
1134 const int nb = ((*n) + 256 - 1) / 256;
1150 void *v1,
void *v2,
void *v3,
int *n,
1169 const int nb = ((*n) + 256 - 1) / 256;
1185 void *v1,
void *v2,
void *v3,
1186 void *w1,
void *w2,
void *
w3,
1208 const int nb = ((*n) + 256 - 1) / 256;
1237 const int nb = ((*n) + 256 - 1) / 256;
1272 for (
i = 0;
i <
nb;
i++) {
1299 const int nt = 256 /
pow2;
1300 const int nb = ((*n) + nt - 1) / nt;
1335 for (
k = 0;
k < (*j);
k++) {
1339 for (
i = 0;
i <
nb;
i++) {
1340 for (
k = 0;
k < (*j);
k++) {
1360 const int nb = ((*n) + 256 - 1) / 256;
1386 for (
i = 0;
i <
nb;
i++) {
1410 const int nb = ((*n) + 256 - 1) / 256;
1436 for (
i = 0;
i <
nb;
i++) {
1460 const int nb = ((*n) + 256 - 1) / 256;
1485 for (
i = 0;
i <
nb;
i++) {
1509 const int nb = ((*n) + 256 - 1) / 256;
1534 for (
i = 1;
i <
nb;
i++) {
1558 const int nb = ((*n) + 256 - 1) / 256;
1583 for (
i = 1;
i <
nb;
i++) {
1611 const int nb = ((*n) + 256 - 1) / 256;
1636 const int nb = ((*n) + 256 - 1) / 256;
1663 const int nb = ((*n) + 256 - 1) / 256;
1692 const int nb = ((*n) + 256 - 1) / 256;
1719 const int nb = ((*n) + 256 - 1) / 256;
1748 const int nb = ((*n) + 256 - 1) / 256;
1775 const int nb = ((*n) + 256 - 1) / 256;
1804 const int nb = ((*n) + 256 - 1) / 256;
1831 const int nb = ((*n) + 256 - 1) / 256;
1860 const int nb = ((*n) + 256 - 1) / 256;
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
void opencl_kernel_jit(const char *kernel, cl_program *program)
void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue)
void opencl_col3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue)
void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
void opencl_pwmax_sca2(void *a, real *c, int *n, cl_command_queue cmd_queue)
void opencl_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cl_command_queue cmd_queue)
void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue)
void opencl_face_masked_gather_copy(void *a, void *b, void *mask, void *facet, int *n1, int *n2, int *lx, int *ly, int *lz, int *m, cl_command_queue cmd_queue)
void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue)
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n, cl_command_queue cmd_queue)
void opencl_sub3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
void opencl_add2s1(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
void opencl_addcol3s2(void *a, void *b, void *c, real *s, int *n, cl_command_queue cmd_queue)
void opencl_invcol1(void *a, int *n, cl_command_queue cmd_queue)
void opencl_add3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
real opencl_glsc3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
void opencl_rone(void *a, int *n, cl_command_queue cmd_queue)
void opencl_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, cl_command_queue cmd_queue)
void opencl_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, cl_command_queue cmd_queue)
void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue)
void opencl_cfill_mask(void *a, void *c, int *size, void *mask, int *mask_size, cl_command_queue cmd_queue)
void opencl_cadd2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
void opencl_masked_scatter_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
void opencl_pwmin_sca3(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
void opencl_pwmax_vec3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
real opencl_glsum(void *a, int *n, cl_command_queue cmd_queue)
void opencl_add4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
void opencl_pwmin_vec3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue)
void opencl_add2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cl_command_queue cmd_queue)
void opencl_pwmin_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue)
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cl_command_queue cmd_queue)
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cl_command_queue cmd_queue)
void opencl_masked_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
real opencl_glsubnorm2(void *a, void *b, int *n, cl_command_queue cmd_queue)
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
void opencl_absval(void *a, int *n, cl_command_queue cmd_queue)
void opencl_cwrap(void *a, real *min_val, real *max_val, int *n, cl_command_queue cmd_queue)
void opencl_addcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
real opencl_glmin(void *a, int *n, cl_command_queue cmd_queue)
void opencl_pwmax_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue)
void opencl_rzero(void *a, int *n, cl_command_queue cmd_queue)
void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue)
void opencl_subcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue)
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
void opencl_pwmin_sca2(void *a, real *c, int *n, cl_command_queue cmd_queue)
void opencl_pwmax_sca3(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue)
real opencl_glsc2(void *a, void *b, int *n, cl_command_queue cmd_queue)
void opencl_cdiv2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
void opencl_masked_copy_0(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue)
void opencl_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
real opencl_glmax(void *a, int *n, cl_command_queue cmd_queue)
void opencl_cmult2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Object for handling masks in Neko.