36 #include <OpenCL/cl.h>
48 #include "gs_kernels.cl.h"
/*
 * Fragment of the gather launcher. Only the tail of the parameter list is
 * visible here; v, m, o and dg, which the body uses, are declared on lines
 * outside this view (see the opencl_gather_kernel signature line elsewhere
 * in this source).
 *
 * Visible behaviour: compute a 1-D launch geometry from *m, then, for each of
 * four kernel names (add, mul, min, max), bind ten kernel arguments and issue
 * a launch call.
 * NOTE(review): op is not referenced on any visible line; presumably *op
 * selects which of the four variants runs -- confirm against the full file.
 */
59 void *
u,
int *n,
void *gd,
int *nb,
60 void *b,
void *bo,
int *op) {
/*
 * Launch geometry: the global size is *m rounded up to the next multiple of
 * 256, and the work-group size is 256.
 * NOTE(review): both `(*m) + 256 - 1` and `256 * nblks` are evaluated in int
 * before the result is widened to size_t -- confirm *m cannot get close to
 * INT_MAX.
 */
66 const int nblks = ((*m) + 256 - 1) / 256;
67 const size_t global_item_size = 256 * nblks;
68 const size_t local_item_size = 256;
/*
 * "add" variant. The call that receives this kernel name is not visible in
 * this fragment -- presumably clCreateKernel(gs_program, ...), as in the
 * scatter launcher; TODO confirm.
 */
75 "gather_kernel_add", &err);
/*
 * Kernel arguments 0-9, in order: v, m, o, dg, u, n, gd, nb, b, bo.
 * Buffer arguments pass the address of the void * parameter together with
 * sizeof(cl_mem), so the pointer value itself is handed over as the cl_mem
 * handle. Integer arguments pass the caller's int * directly, so sizeof(int)
 * bytes are read from the pointed-to value.
 */
78 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &
v));
79 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(
int), m));
80 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), o));
81 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &dg));
82 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(cl_mem), (
void *) &
u));
83 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(
int), n));
84 CL_CHECK(clSetKernelArg(kernel, 6,
sizeof(cl_mem), (
void *) &gd));
85 CL_CHECK(clSetKernelArg(kernel, 7,
sizeof(
int), nb));
86 CL_CHECK(clSetKernelArg(kernel, 8,
sizeof(cl_mem), (
void *) &b));
87 CL_CHECK(clSetKernelArg(kernel, 9,
sizeof(cl_mem), (
void *) &bo));
/*
 * Tail of the launch call; the callee and its leading arguments are not
 * visible. If this is clEnqueueNDRangeKernel (TODO confirm), the visible
 * arguments mean: 1 work dimension, no global offset, the sizes computed
 * above, an empty wait list and no completion event.
 */
90 1, NULL, &global_item_size,
91 &local_item_size, 0, NULL, NULL));
/* "mul" variant: same ten-argument layout and launch tail as "add". */
97 "gather_kernel_mul", &err);
100 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &
v));
101 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(
int), m));
102 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), o));
103 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &dg));
104 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(cl_mem), (
void *) &
u));
105 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(
int), n));
106 CL_CHECK(clSetKernelArg(kernel, 6,
sizeof(cl_mem), (
void *) &gd));
107 CL_CHECK(clSetKernelArg(kernel, 7,
sizeof(
int), nb));
108 CL_CHECK(clSetKernelArg(kernel, 8,
sizeof(cl_mem), (
void *) &b));
109 CL_CHECK(clSetKernelArg(kernel, 9,
sizeof(cl_mem), (
void *) &bo));
112 1, NULL, &global_item_size,
113 &local_item_size, 0, NULL, NULL));
/* "min" variant: same ten-argument layout and launch tail as "add". */
119 "gather_kernel_min", &err);
122 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &
v));
123 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(
int), m));
124 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), o));
125 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &dg));
126 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(cl_mem), (
void *) &
u));
127 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(
int), n));
128 CL_CHECK(clSetKernelArg(kernel, 6,
sizeof(cl_mem), (
void *) &gd));
129 CL_CHECK(clSetKernelArg(kernel, 7,
sizeof(
int), nb));
130 CL_CHECK(clSetKernelArg(kernel, 8,
sizeof(cl_mem), (
void *) &b));
131 CL_CHECK(clSetKernelArg(kernel, 9,
sizeof(cl_mem), (
void *) &bo));
134 1, NULL, &global_item_size,
135 &local_item_size, 0, NULL, NULL));
/* "max" variant: same ten-argument layout and launch tail as "add". */
141 "gather_kernel_max", &err);
144 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &
v));
145 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(
int), m));
146 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), o));
147 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &dg));
148 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(cl_mem), (
void *) &
u));
149 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(
int), n));
150 CL_CHECK(clSetKernelArg(kernel, 6,
sizeof(cl_mem), (
void *) &gd));
151 CL_CHECK(clSetKernelArg(kernel, 7,
sizeof(
int), nb));
152 CL_CHECK(clSetKernelArg(kernel, 8,
sizeof(cl_mem), (
void *) &b));
153 CL_CHECK(clSetKernelArg(kernel, 9,
sizeof(cl_mem), (
void *) &bo));
156 1, NULL, &global_item_size,
157 &local_item_size, 0, NULL, NULL));
/*
 * Fragment of the scatter launcher. Only the tail of the parameter list is
 * visible here; v, m and dg, which the body uses, are declared on lines
 * outside this view (see the opencl_scatter_kernel signature line elsewhere
 * in this source).
 *
 * Visible behaviour: create the "scatter_kernel" kernel from gs_program, bind
 * nine arguments, compute a 1-D launch geometry from *m and issue a launch
 * call whose head and tail are cut off in this view.
 */
167 void *
u,
int *n,
void *gd,
168 int *nb,
void *b,
void *bo) {
/*
 * NOTE(review): no check of err after clCreateKernel and no release of the
 * kernel object are visible in this fragment -- confirm in the full file.
 */
174 cl_kernel kernel = clCreateKernel(
gs_program,
"scatter_kernel", &err);
/*
 * Kernel arguments 0-8, in order: v, m, dg, u, n, gd, nb, b, bo.
 * Buffer arguments pass the address of the void * parameter together with
 * sizeof(cl_mem), so the pointer value itself is handed over as the cl_mem
 * handle. Integer arguments pass the caller's int * directly, so sizeof(int)
 * bytes are read from the pointed-to value.
 */
177 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &
v));
178 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(
int), m));
179 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &dg));
180 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &
u));
181 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int), n));
182 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(cl_mem), (
void *) &gd));
183 CL_CHECK(clSetKernelArg(kernel, 6,
sizeof(
int), nb));
184 CL_CHECK(clSetKernelArg(kernel, 7,
sizeof(cl_mem), (
void *) &b));
185 CL_CHECK(clSetKernelArg(kernel, 8,
sizeof(cl_mem), (
void *) &bo));
/*
 * Launch geometry: the global size is *m rounded up to the next multiple of
 * 256, and the work-group size is 256.
 * NOTE(review): both `(*m) + 256 - 1` and `256 * nblks` are evaluated in int
 * before the result is widened to size_t -- confirm *m cannot get close to
 * INT_MAX.
 */
187 const int nblks = ((*m) + 256 - 1) / 256;
188 const size_t global_item_size = 256 * nblks;
189 const size_t local_item_size = 256;
/*
 * Middle of the launch call; the callee, its leading arguments and its
 * trailing arguments are not visible -- presumably clEnqueueNDRangeKernel,
 * TODO confirm.
 */
192 NULL, &global_item_size, &local_item_size,
/*
 * NOTE(review): the next two lines look like truncated remnants of device
 * kernel signatures. No function name is present and only the last parameter
 * name (u, v) survives, so they cannot be interpreted from this view.
 */
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
/*
 * Host launcher signatures (no bodies on these lines).
 * opencl_gather_kernel: the first ten parameters appear in the same order as
 * kernel argument indices 0-9 bound in the gather launcher, followed by op.
 */
void opencl_gather_kernel(void *v, int *m, int *o, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo, int *op)
/*
 * opencl_scatter_kernel: parameters appear in the same order as kernel
 * argument indices 0-8 bound in the scatter launcher.
 */
void opencl_scatter_kernel(void *v, int *m, void *dg, void *u, int *n, void *gd, int *nb, void *b, void *bo)
/*
 * opencl_kernel_jit: takes a kernel string and a cl_program pointer; its body
 * is not in view, so its behaviour is not documented here.
 */
void opencl_kernel_jit(const char *kernel, cl_program *program)