36 #include <OpenCL/cl.h>
48 #include "math_kernel.cl.h"
55 b, a, 0, 0, (*n) *
sizeof(
real),
68 cl_kernel kernel = clCreateKernel(
math_program,
"masked_copy_kernel", &err);
71 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
72 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
73 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &mask));
74 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
75 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), m));
77 const int nb = ((*n) + 256 - 1) / 256;
78 const size_t global_item_size = 256 * nb;
79 const size_t local_item_size = 256;
82 NULL, &global_item_size, &local_item_size,
95 a, &zero,
sizeof(
real), 0,
96 (*n) *
sizeof(
real), 0, NULL, &wait_kern));
97 CL_CHECK(clWaitForEvents(1, &wait_kern));
108 a, &one,
sizeof(
real), 0,
109 (*n) *
sizeof(
real), 0, NULL, &wait_kern));
110 CL_CHECK(clWaitForEvents(1, &wait_kern));
122 cl_kernel kernel = clCreateKernel(
math_program,
"cmult2_kernel", &err);
125 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
126 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
128 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
130 const int nb = ((*n) + 256 - 1) / 256;
131 const size_t global_item_size = 256 * nb;
132 const size_t local_item_size = 256;
135 NULL, &global_item_size, &local_item_size,
149 cl_kernel kernel = clCreateKernel(
math_program,
"cmult_kernel", &err);
152 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
154 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
156 const int nb = ((*n) + 256 - 1) / 256;
157 const size_t global_item_size = 256 * nb;
158 const size_t local_item_size = 256;
161 NULL, &global_item_size, &local_item_size,
174 cl_kernel kernel = clCreateKernel(
math_program,
"cadd_kernel", &err);
177 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
179 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
181 const int nb = ((*n) + 256 - 1) / 256;
182 const size_t global_item_size = 256 * nb;
183 const size_t local_item_size = 256;
186 NULL, &global_item_size, &local_item_size,
199 cl_kernel kernel = clCreateKernel(
math_program,
"cfill_kernel", &err);
202 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
204 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
206 const int nb = ((*n) + 256 - 1) / 256;
207 const size_t global_item_size = 256 * nb;
208 const size_t local_item_size = 256;
211 NULL, &global_item_size, &local_item_size,
225 cl_kernel kernel = clCreateKernel(
math_program,
"add2_kernel", &err);
228 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
229 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
230 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
232 const int nb = ((*n) + 256 - 1) / 256;
233 const size_t global_item_size = 256 * nb;
234 const size_t local_item_size = 256;
237 NULL, &global_item_size, &local_item_size,
252 cl_kernel kernel = clCreateKernel(
math_program,
"add2s1_kernel", &err);
255 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
256 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
258 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
260 const int nb = ((*n) + 256 - 1) / 256;
261 const size_t global_item_size = 256 * nb;
262 const size_t local_item_size = 256;
265 NULL, &global_item_size, &local_item_size,
280 cl_kernel kernel = clCreateKernel(
math_program,
"add2s2_kernel", &err);
283 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
284 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
286 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
288 const int nb = ((*n) + 256 - 1) / 256;
289 const size_t global_item_size = 256 * nb;
290 const size_t local_item_size = 256;
293 NULL, &global_item_size, &local_item_size,
309 cl_kernel kernel = clCreateKernel(
math_program,
"add2s2_many_kernel", &err);
312 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &
x));
313 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &p));
314 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &alpha));
315 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int),
j));
316 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int), n));
318 const int nb = ((*n) + 256 - 1) / 256;
319 const size_t global_item_size = 256 * nb;
320 const size_t local_item_size = 256;
323 NULL, &global_item_size, &local_item_size,
339 cl_kernel kernel = clCreateKernel(
math_program,
"addsqr2s2_kernel", &err);
342 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
343 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
345 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
347 const int nb = ((*n) + 256 - 1) / 256;
348 const size_t global_item_size = 256 * nb;
349 const size_t local_item_size = 256;
352 NULL, &global_item_size, &local_item_size,
366 cl_kernel kernel = clCreateKernel(
math_program,
"add3s2_kernel", &err);
369 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
370 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
371 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
374 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(
int), n));
376 const int nb = ((*n) + 256 - 1) / 256;
377 const size_t global_item_size = 256 * nb;
378 const size_t local_item_size = 256;
381 NULL, &global_item_size, &local_item_size,
395 cl_kernel kernel = clCreateKernel(
math_program,
"invcol1_kernel", &err);
397 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
398 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(
int), n));
400 const int nb = ((*n) + 256 - 1) / 256;
401 const size_t global_item_size = 256 * nb;
402 const size_t local_item_size = 256;
405 NULL, &global_item_size, &local_item_size,
419 cl_kernel kernel = clCreateKernel(
math_program,
"invcol2_kernel", &err);
422 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
423 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
424 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
426 const int nb = ((*n) + 256 - 1) / 256;
427 const size_t global_item_size = 256 * nb;
428 const size_t local_item_size = 256;
431 NULL, &global_item_size, &local_item_size,
445 cl_kernel kernel = clCreateKernel(
math_program,
"col2_kernel", &err);
448 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
449 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
450 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
452 const int nb = ((*n) + 256 - 1) / 256;
453 const size_t global_item_size = 256 * nb;
454 const size_t local_item_size = 256;
457 NULL, &global_item_size, &local_item_size,
471 cl_kernel kernel = clCreateKernel(
math_program,
"col3_kernel", &err);
474 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
475 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
476 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
477 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
479 const int nb = ((*n) + 256 - 1) / 256;
480 const size_t global_item_size = 256 * nb;
481 const size_t local_item_size = 256;
484 NULL, &global_item_size, &local_item_size,
498 cl_kernel kernel = clCreateKernel(
math_program,
"subcol3_kernel", &err);
501 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
502 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
503 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
504 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
506 const int nb = ((*n) + 256 - 1) / 256;
507 const size_t global_item_size = 256 * nb;
508 const size_t local_item_size = 256;
511 NULL, &global_item_size, &local_item_size,
525 cl_kernel kernel = clCreateKernel(
math_program,
"sub2_kernel", &err);
528 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
529 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
530 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
532 const int nb = ((*n) + 256 - 1) / 256;
533 const size_t global_item_size = 256 * nb;
534 const size_t local_item_size = 256;
537 NULL, &global_item_size, &local_item_size,
551 cl_kernel kernel = clCreateKernel(
math_program,
"sub3_kernel", &err);
554 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
555 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
556 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
557 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
559 const int nb = ((*n) + 256 - 1) / 256;
560 const size_t global_item_size = 256 * nb;
561 const size_t local_item_size = 256;
564 NULL, &global_item_size, &local_item_size,
578 cl_kernel kernel = clCreateKernel(
math_program,
"addcol3_kernel", &err);
581 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
582 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
583 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
584 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
586 const int nb = ((*n) + 256 - 1) / 256;
587 const size_t global_item_size = 256 * nb;
588 const size_t local_item_size = 256;
591 NULL, &global_item_size, &local_item_size,
605 cl_kernel kernel = clCreateKernel(
math_program,
"addcol4_kernel", &err);
608 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
609 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
610 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
611 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &d));
612 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int), n));
614 const int nb = ((*n) + 256 - 1) / 256;
615 const size_t global_item_size = 256 * nb;
616 const size_t local_item_size = 256;
619 NULL, &global_item_size, &local_item_size,
629 void *v1,
void *v2,
void *v3,
int *n) {
635 cl_kernel kernel = clCreateKernel(
math_program,
"vdot3_kernel", &err);
638 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &dot));
639 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &u1));
640 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &u2));
641 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &u3));
642 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(cl_mem), (
void *) &v1));
643 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(cl_mem), (
void *) &v2));
644 CL_CHECK(clSetKernelArg(kernel, 6,
sizeof(cl_mem), (
void *) &v3));
645 CL_CHECK(clSetKernelArg(kernel, 7,
sizeof(
int), n));
647 const int nb = ((*n) + 256 - 1) / 256;
648 const size_t global_item_size = 256 * nb;
649 const size_t local_item_size = 256;
652 NULL, &global_item_size, &local_item_size,
673 const int nb = ((*n) + 256 - 1) / 256;
674 const size_t global_item_size = 256 * nb;
675 const size_t local_item_size = 256;
686 nb *
sizeof(
real), NULL, &err);
690 cl_kernel kernel = clCreateKernel(
math_program,
"glsc3_kernel", &err);
693 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
694 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
695 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
697 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int), n));
700 NULL, &global_item_size, &local_item_size,
701 0, NULL, &kern_wait));
708 for (
i = 0;
i < nb;
i++) {
732 const int nt = 256 / pow2;
733 const int nb = ((*n) + nt - 1) / nt;
734 const size_t local_item_size[2] = {nt, pow2};
735 const size_t global_item_size[2] = {nb * nt, pow2};
746 (*
j) * nb *
sizeof(
real), NULL, &err);
750 cl_kernel kernel = clCreateKernel(
math_program,
"glsc3_many_kernel", &err);
753 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &
w));
754 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &
v));
755 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &mult));
757 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int),
j));
758 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(
int), n));
761 NULL, global_item_size, local_item_size,
762 0, NULL, &kern_wait));
766 bufred, 1, &kern_wait, NULL));
768 for (k = 0; k < (*j); k++) {
772 for (
i = 0;
i < nb;
i++) {
773 for (k = 0; k < (*j); k++) {
791 const int nb = ((*n) + 256 - 1) / 256;
792 const size_t global_item_size = 256 * nb;
793 const size_t local_item_size = 256;
797 cl_kernel kernel = clCreateKernel(
math_program,
"glsc2_kernel", &err);
800 cl_mem buf_d = clCreateBuffer(
glb_ctx, CL_MEM_READ_WRITE,
801 nb *
sizeof(
real), NULL, &err);
804 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
805 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
806 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &buf_d));
807 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
810 NULL, &global_item_size, &local_item_size,
811 0, NULL, &kern_wait));
815 0, nb *
sizeof(
real),
buf, 1, &kern_wait, NULL));
818 for (
i = 0;
i < nb;
i++) {
823 CL_CHECK(clReleaseMemObject(buf_d));
840 const int nb = ((*n) + 256 - 1) / 256;
841 const size_t global_item_size = 256 * nb;
842 const size_t local_item_size = 256;
846 cl_kernel kernel = clCreateKernel(
math_program,
"glsum_kernel", &err);
849 cl_mem buf_d = clCreateBuffer(
glb_ctx, CL_MEM_READ_WRITE,
850 nb *
sizeof(
real), NULL, &err);
853 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
854 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &buf_d));
855 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
858 NULL, &global_item_size, &local_item_size,
859 0, NULL, &kern_wait));
863 0, nb *
sizeof(
real),
buf, 1, &kern_wait, NULL));
866 for (
i = 0;
i < nb;
i++) {
871 CL_CHECK(clReleaseMemObject(buf_d));
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void const T *__restrict__ const T *__restrict__ v
void opencl_kernel_jit(const char *kernel, cl_program *program)
void opencl_addcol3(void *a, void *b, void *c, int *n)
void opencl_invcol1(void *a, int *n)
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n)
void opencl_cmult(void *a, real *c, int *n)
void opencl_sub3(void *a, void *b, void *c, int *n)
void opencl_rone(void *a, int *n)
void opencl_cadd(void *a, real *c, int *n)
void opencl_cmult2(void *a, void *b, real *c, int *n)
real opencl_glsc3(void *a, void *b, void *c, int *n)
void opencl_add2s2(void *a, void *b, real *c1, int *n)
void opencl_rzero(void *a, int *n)
void opencl_sub2(void *a, void *b, int *n)
void opencl_col2(void *a, void *b, int *n)
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n)
void opencl_col3(void *a, void *b, void *c, int *n)
void opencl_subcol3(void *a, void *b, void *c, int *n)
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n)
void opencl_invcol2(void *a, void *b, int *n)
void opencl_add2(void *a, void *b, int *n)
void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m)
void opencl_cfill(void *a, real *c, int *n)
void opencl_add2s1(void *a, void *b, real *c1, int *n)
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
real opencl_glsc2(void *a, void *b, int *n)
real opencl_glsum(void *a, int *n)
void opencl_copy(void *a, void *b, int *n)