36 #include <OpenCL/cl.h>
48 #include "math_kernel.cl.h"
55 b, a, 0, 0, (*n) *
sizeof(
real),
68 cl_kernel kernel = clCreateKernel(
math_program,
"masked_copy_kernel", &err);
71 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
72 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
73 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &mask));
74 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
75 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), m));
77 const int nb = ((*n) + 256 - 1) / 256;
78 const size_t global_item_size = 256 * nb;
79 const size_t local_item_size = 256;
82 NULL, &global_item_size, &local_item_size,
96 cl_kernel kernel = clCreateKernel(
math_program,
"cfill_mask_kernel", &err);
99 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
101 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), size));
102 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &mask));
103 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int), mask_size));
105 const int nb = ((*mask_size) + 256 - 1) / 256;
106 const size_t global_item_size = 256 * nb;
107 const size_t local_item_size = 256;
110 NULL, &global_item_size, &local_item_size,
122 a, &zero,
sizeof(
real), 0,
123 (*n) *
sizeof(
real), 0, NULL, &wait_kern));
124 CL_CHECK(clWaitForEvents(1, &wait_kern));
135 a, &one,
sizeof(
real), 0,
136 (*n) *
sizeof(
real), 0, NULL, &wait_kern));
137 CL_CHECK(clWaitForEvents(1, &wait_kern));
149 cl_kernel kernel = clCreateKernel(
math_program,
"cmult2_kernel", &err);
152 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
153 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
155 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
157 const int nb = ((*n) + 256 - 1) / 256;
158 const size_t global_item_size = 256 * nb;
159 const size_t local_item_size = 256;
162 NULL, &global_item_size, &local_item_size,
176 cl_kernel kernel = clCreateKernel(
math_program,
"cmult_kernel", &err);
179 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
181 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
183 const int nb = ((*n) + 256 - 1) / 256;
184 const size_t global_item_size = 256 * nb;
185 const size_t local_item_size = 256;
188 NULL, &global_item_size, &local_item_size,
201 cl_kernel kernel = clCreateKernel(
math_program,
"cadd_kernel", &err);
204 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
206 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
208 const int nb = ((*n) + 256 - 1) / 256;
209 const size_t global_item_size = 256 * nb;
210 const size_t local_item_size = 256;
213 NULL, &global_item_size, &local_item_size,
226 cl_kernel kernel = clCreateKernel(
math_program,
"cadd2_kernel", &err);
229 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
230 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
232 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
234 const int nb = ((*n) + 256 - 1) / 256;
235 const size_t global_item_size = 256 * nb;
236 const size_t local_item_size = 256;
239 NULL, &global_item_size, &local_item_size,
252 cl_kernel kernel = clCreateKernel(
math_program,
"cfill_kernel", &err);
255 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
257 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
259 const int nb = ((*n) + 256 - 1) / 256;
260 const size_t global_item_size = 256 * nb;
261 const size_t local_item_size = 256;
264 NULL, &global_item_size, &local_item_size,
278 cl_kernel kernel = clCreateKernel(
math_program,
"add2_kernel", &err);
281 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
282 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
283 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
285 const int nb = ((*n) + 256 - 1) / 256;
286 const size_t global_item_size = 256 * nb;
287 const size_t local_item_size = 256;
290 NULL, &global_item_size, &local_item_size,
304 cl_kernel kernel = clCreateKernel(
math_program,
"add3_kernel", &err);
307 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
308 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
309 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
310 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
312 const int nb = ((*n) + 256 - 1) / 256;
313 const size_t global_item_size = 256 * nb;
314 const size_t local_item_size = 256;
317 NULL, &global_item_size, &local_item_size,
332 cl_kernel kernel = clCreateKernel(
math_program,
"add2s1_kernel", &err);
335 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
336 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
338 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
340 const int nb = ((*n) + 256 - 1) / 256;
341 const size_t global_item_size = 256 * nb;
342 const size_t local_item_size = 256;
345 NULL, &global_item_size, &local_item_size,
360 cl_kernel kernel = clCreateKernel(
math_program,
"add2s2_kernel", &err);
363 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
364 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
366 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
368 const int nb = ((*n) + 256 - 1) / 256;
369 const size_t global_item_size = 256 * nb;
370 const size_t local_item_size = 256;
373 NULL, &global_item_size, &local_item_size,
389 cl_kernel kernel = clCreateKernel(
math_program,
"add2s2_many_kernel", &err);
392 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &
x));
393 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &p));
394 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &alpha));
395 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int),
j));
396 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int), n));
398 const int nb = ((*n) + 256 - 1) / 256;
399 const size_t global_item_size = 256 * nb;
400 const size_t local_item_size = 256;
403 NULL, &global_item_size, &local_item_size,
419 cl_kernel kernel = clCreateKernel(
math_program,
"addsqr2s2_kernel", &err);
422 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
423 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
425 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
427 const int nb = ((*n) + 256 - 1) / 256;
428 const size_t global_item_size = 256 * nb;
429 const size_t local_item_size = 256;
432 NULL, &global_item_size, &local_item_size,
446 cl_kernel kernel = clCreateKernel(
math_program,
"add3s2_kernel", &err);
449 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
450 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
451 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
454 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(
int), n));
456 const int nb = ((*n) + 256 - 1) / 256;
457 const size_t global_item_size = 256 * nb;
458 const size_t local_item_size = 256;
461 NULL, &global_item_size, &local_item_size,
475 cl_kernel kernel = clCreateKernel(
math_program,
"invcol1_kernel", &err);
477 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
478 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(
int), n));
480 const int nb = ((*n) + 256 - 1) / 256;
481 const size_t global_item_size = 256 * nb;
482 const size_t local_item_size = 256;
485 NULL, &global_item_size, &local_item_size,
499 cl_kernel kernel = clCreateKernel(
math_program,
"invcol2_kernel", &err);
502 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
503 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
504 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
506 const int nb = ((*n) + 256 - 1) / 256;
507 const size_t global_item_size = 256 * nb;
508 const size_t local_item_size = 256;
511 NULL, &global_item_size, &local_item_size,
525 cl_kernel kernel = clCreateKernel(
math_program,
"col2_kernel", &err);
528 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
529 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
530 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
532 const int nb = ((*n) + 256 - 1) / 256;
533 const size_t global_item_size = 256 * nb;
534 const size_t local_item_size = 256;
537 NULL, &global_item_size, &local_item_size,
551 cl_kernel kernel = clCreateKernel(
math_program,
"col3_kernel", &err);
554 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
555 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
556 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
557 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
559 const int nb = ((*n) + 256 - 1) / 256;
560 const size_t global_item_size = 256 * nb;
561 const size_t local_item_size = 256;
564 NULL, &global_item_size, &local_item_size,
578 cl_kernel kernel = clCreateKernel(
math_program,
"subcol3_kernel", &err);
581 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
582 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
583 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
584 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
586 const int nb = ((*n) + 256 - 1) / 256;
587 const size_t global_item_size = 256 * nb;
588 const size_t local_item_size = 256;
591 NULL, &global_item_size, &local_item_size,
605 cl_kernel kernel = clCreateKernel(
math_program,
"sub2_kernel", &err);
608 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
609 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
610 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
612 const int nb = ((*n) + 256 - 1) / 256;
613 const size_t global_item_size = 256 * nb;
614 const size_t local_item_size = 256;
617 NULL, &global_item_size, &local_item_size,
631 cl_kernel kernel = clCreateKernel(
math_program,
"sub3_kernel", &err);
634 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
635 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
636 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
637 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
639 const int nb = ((*n) + 256 - 1) / 256;
640 const size_t global_item_size = 256 * nb;
641 const size_t local_item_size = 256;
644 NULL, &global_item_size, &local_item_size,
658 cl_kernel kernel = clCreateKernel(
math_program,
"addcol3_kernel", &err);
661 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
662 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
663 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
664 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
666 const int nb = ((*n) + 256 - 1) / 256;
667 const size_t global_item_size = 256 * nb;
668 const size_t local_item_size = 256;
671 NULL, &global_item_size, &local_item_size,
685 cl_kernel kernel = clCreateKernel(
math_program,
"addcol4_kernel", &err);
688 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
689 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
690 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
691 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &d));
692 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int), n));
694 const int nb = ((*n) + 256 - 1) / 256;
695 const size_t global_item_size = 256 * nb;
696 const size_t local_item_size = 256;
699 NULL, &global_item_size, &local_item_size,
709 void *v1,
void *v2,
void *v3,
int *n) {
715 cl_kernel kernel = clCreateKernel(
math_program,
"vdot3_kernel", &err);
718 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &dot));
719 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &u1));
720 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &u2));
721 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &u3));
722 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(cl_mem), (
void *) &v1));
723 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(cl_mem), (
void *) &v2));
724 CL_CHECK(clSetKernelArg(kernel, 6,
sizeof(cl_mem), (
void *) &v3));
725 CL_CHECK(clSetKernelArg(kernel, 7,
sizeof(
int), n));
727 const int nb = ((*n) + 256 - 1) / 256;
728 const size_t global_item_size = 256 * nb;
729 const size_t local_item_size = 256;
732 NULL, &global_item_size, &local_item_size,
753 const int nb = ((*n) + 256 - 1) / 256;
754 const size_t global_item_size = 256 * nb;
755 const size_t local_item_size = 256;
766 nb *
sizeof(
real), NULL, &err);
770 cl_kernel kernel = clCreateKernel(
math_program,
"glsc3_kernel", &err);
773 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
774 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
775 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
777 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int), n));
780 NULL, &global_item_size, &local_item_size,
781 0, NULL, &kern_wait));
788 for (
i = 0;
i < nb;
i++) {
812 const int nt = 256 / pow2;
813 const int nb = ((*n) + nt - 1) / nt;
814 const size_t local_item_size[2] = {nt, pow2};
815 const size_t global_item_size[2] = {nb * nt, pow2};
826 (*
j) * nb *
sizeof(
real), NULL, &err);
830 cl_kernel kernel = clCreateKernel(
math_program,
"glsc3_many_kernel", &err);
833 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &
w));
834 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &
v));
835 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &mult));
837 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int),
j));
838 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(
int), n));
841 NULL, global_item_size, local_item_size,
842 0, NULL, &kern_wait));
846 bufred, 1, &kern_wait, NULL));
848 for (k = 0; k < (*j); k++) {
852 for (
i = 0;
i < nb;
i++) {
853 for (k = 0; k < (*j); k++) {
871 const int nb = ((*n) + 256 - 1) / 256;
872 const size_t global_item_size = 256 * nb;
873 const size_t local_item_size = 256;
877 cl_kernel kernel = clCreateKernel(
math_program,
"glsc2_kernel", &err);
880 cl_mem buf_d = clCreateBuffer(
glb_ctx, CL_MEM_READ_WRITE,
881 nb *
sizeof(
real), NULL, &err);
884 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
885 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
886 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &buf_d));
887 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
890 NULL, &global_item_size, &local_item_size,
891 0, NULL, &kern_wait));
895 0, nb *
sizeof(
real),
buf, 1, &kern_wait, NULL));
898 for (
i = 0;
i < nb;
i++) {
903 CL_CHECK(clReleaseMemObject(buf_d));
920 const int nb = ((*n) + 256 - 1) / 256;
921 const size_t global_item_size = 256 * nb;
922 const size_t local_item_size = 256;
926 cl_kernel kernel = clCreateKernel(
math_program,
"glsum_kernel", &err);
929 cl_mem buf_d = clCreateBuffer(
glb_ctx, CL_MEM_READ_WRITE,
930 nb *
sizeof(
real), NULL, &err);
933 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
934 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &buf_d));
935 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
938 NULL, &global_item_size, &local_item_size,
939 0, NULL, &kern_wait));
943 0, nb *
sizeof(
real),
buf, 1, &kern_wait, NULL));
946 for (
i = 0;
i < nb;
i++) {
951 CL_CHECK(clReleaseMemObject(buf_d));
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void const T *__restrict__ const T *__restrict__ v
void opencl_kernel_jit(const char *kernel, cl_program *program)
void opencl_add3(void *a, void *b, void *c, int *n)
void opencl_addcol3(void *a, void *b, void *c, int *n)
void opencl_invcol1(void *a, int *n)
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n)
void opencl_cmult(void *a, real *c, int *n)
void opencl_sub3(void *a, void *b, void *c, int *n)
void opencl_rone(void *a, int *n)
void opencl_cadd(void *a, real *c, int *n)
void opencl_cmult2(void *a, void *b, real *c, int *n)
real opencl_glsc3(void *a, void *b, void *c, int *n)
void opencl_add2s2(void *a, void *b, real *c1, int *n)
void opencl_rzero(void *a, int *n)
void opencl_sub2(void *a, void *b, int *n)
void opencl_col2(void *a, void *b, int *n)
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n)
void opencl_col3(void *a, void *b, void *c, int *n)
void opencl_subcol3(void *a, void *b, void *c, int *n)
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n)
void opencl_invcol2(void *a, void *b, int *n)
void opencl_cadd2(void *a, void *b, real *c, int *n)
void opencl_add2(void *a, void *b, int *n)
void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m)
void opencl_cfill_mask(void *a, void *c, int *size, void *mask, int *mask_size)
void opencl_cfill(void *a, real *c, int *n)
void opencl_add2s1(void *a, void *b, real *c1, int *n)
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
real opencl_glsc2(void *a, void *b, int *n)
real opencl_glsum(void *a, int *n)
void opencl_copy(void *a, void *b, int *n)