36 #include <OpenCL/cl.h>
48 #include "math_kernel.cl.h"
55 b, a, 0, 0, (*n) *
sizeof(
real),
68 cl_kernel kernel = clCreateKernel(
math_program,
"masked_copy_kernel", &err);
71 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
72 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
73 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &mask));
74 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
75 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), m));
77 const int nb = ((*n) + 256 - 1) / 256;
78 const size_t global_item_size = 256 * nb;
79 const size_t local_item_size = 256;
82 NULL, &global_item_size, &local_item_size,
96 cl_kernel kernel = clCreateKernel(
math_program,
"cfill_mask_kernel", &err);
99 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
101 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), size));
102 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &mask));
103 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int), mask_size));
105 const int nb = ((*mask_size) + 256 - 1) / 256;
106 const size_t global_item_size = 256 * nb;
107 const size_t local_item_size = 256;
110 NULL, &global_item_size, &local_item_size,
122 a, &zero,
sizeof(
real), 0,
123 (*n) *
sizeof(
real), 0, NULL, &wait_kern));
124 CL_CHECK(clWaitForEvents(1, &wait_kern));
135 a, &one,
sizeof(
real), 0,
136 (*n) *
sizeof(
real), 0, NULL, &wait_kern));
137 CL_CHECK(clWaitForEvents(1, &wait_kern));
149 cl_kernel kernel = clCreateKernel(
math_program,
"cmult2_kernel", &err);
152 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
153 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
155 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
157 const int nb = ((*n) + 256 - 1) / 256;
158 const size_t global_item_size = 256 * nb;
159 const size_t local_item_size = 256;
162 NULL, &global_item_size, &local_item_size,
176 cl_kernel kernel = clCreateKernel(
math_program,
"cmult_kernel", &err);
179 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
181 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
183 const int nb = ((*n) + 256 - 1) / 256;
184 const size_t global_item_size = 256 * nb;
185 const size_t local_item_size = 256;
188 NULL, &global_item_size, &local_item_size,
201 cl_kernel kernel = clCreateKernel(
math_program,
"cadd_kernel", &err);
204 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
206 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
208 const int nb = ((*n) + 256 - 1) / 256;
209 const size_t global_item_size = 256 * nb;
210 const size_t local_item_size = 256;
213 NULL, &global_item_size, &local_item_size,
226 cl_kernel kernel = clCreateKernel(
math_program,
"cadd2_kernel", &err);
229 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
230 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
232 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
234 const int nb = ((*n) + 256 - 1) / 256;
235 const size_t global_item_size = 256 * nb;
236 const size_t local_item_size = 256;
239 NULL, &global_item_size, &local_item_size,
252 cl_kernel kernel = clCreateKernel(
math_program,
"cfill_kernel", &err);
255 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
257 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
259 const int nb = ((*n) + 256 - 1) / 256;
260 const size_t global_item_size = 256 * nb;
261 const size_t local_item_size = 256;
264 NULL, &global_item_size, &local_item_size,
278 cl_kernel kernel = clCreateKernel(
math_program,
"add2_kernel", &err);
281 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
282 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
283 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
285 const int nb = ((*n) + 256 - 1) / 256;
286 const size_t global_item_size = 256 * nb;
287 const size_t local_item_size = 256;
290 NULL, &global_item_size, &local_item_size,
304 cl_kernel kernel = clCreateKernel(
math_program,
"add3_kernel", &err);
307 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
308 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
309 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
310 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
312 const int nb = ((*n) + 256 - 1) / 256;
313 const size_t global_item_size = 256 * nb;
314 const size_t local_item_size = 256;
317 NULL, &global_item_size, &local_item_size,
331 cl_kernel kernel = clCreateKernel(
math_program,
"add4_kernel", &err);
334 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
335 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
336 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
337 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &d));
338 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int), n));
340 const int nb = ((*n) + 256 - 1) / 256;
341 const size_t global_item_size = 256 * nb;
342 const size_t local_item_size = 256;
345 NULL, &global_item_size, &local_item_size,
360 cl_kernel kernel = clCreateKernel(
math_program,
"add2s1_kernel", &err);
363 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
364 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
366 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
368 const int nb = ((*n) + 256 - 1) / 256;
369 const size_t global_item_size = 256 * nb;
370 const size_t local_item_size = 256;
373 NULL, &global_item_size, &local_item_size,
388 cl_kernel kernel = clCreateKernel(
math_program,
"add2s2_kernel", &err);
391 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
392 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
394 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
396 const int nb = ((*n) + 256 - 1) / 256;
397 const size_t global_item_size = 256 * nb;
398 const size_t local_item_size = 256;
401 NULL, &global_item_size, &local_item_size,
417 cl_kernel kernel = clCreateKernel(
math_program,
"add2s2_many_kernel", &err);
420 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &
x));
421 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &p));
422 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &alpha));
423 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int),
j));
424 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int), n));
426 const int nb = ((*n) + 256 - 1) / 256;
427 const size_t global_item_size = 256 * nb;
428 const size_t local_item_size = 256;
431 NULL, &global_item_size, &local_item_size,
447 cl_kernel kernel = clCreateKernel(
math_program,
"addsqr2s2_kernel", &err);
450 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
451 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
453 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
455 const int nb = ((*n) + 256 - 1) / 256;
456 const size_t global_item_size = 256 * nb;
457 const size_t local_item_size = 256;
460 NULL, &global_item_size, &local_item_size,
474 cl_kernel kernel = clCreateKernel(
math_program,
"add3s2_kernel", &err);
477 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
478 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
479 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
482 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(
int), n));
484 const int nb = ((*n) + 256 - 1) / 256;
485 const size_t global_item_size = 256 * nb;
486 const size_t local_item_size = 256;
489 NULL, &global_item_size, &local_item_size,
503 cl_kernel kernel = clCreateKernel(
math_program,
"invcol1_kernel", &err);
505 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
506 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(
int), n));
508 const int nb = ((*n) + 256 - 1) / 256;
509 const size_t global_item_size = 256 * nb;
510 const size_t local_item_size = 256;
513 NULL, &global_item_size, &local_item_size,
527 cl_kernel kernel = clCreateKernel(
math_program,
"invcol2_kernel", &err);
530 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
531 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
532 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
534 const int nb = ((*n) + 256 - 1) / 256;
535 const size_t global_item_size = 256 * nb;
536 const size_t local_item_size = 256;
539 NULL, &global_item_size, &local_item_size,
553 cl_kernel kernel = clCreateKernel(
math_program,
"col2_kernel", &err);
556 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
557 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
558 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
560 const int nb = ((*n) + 256 - 1) / 256;
561 const size_t global_item_size = 256 * nb;
562 const size_t local_item_size = 256;
565 NULL, &global_item_size, &local_item_size,
579 cl_kernel kernel = clCreateKernel(
math_program,
"col3_kernel", &err);
582 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
583 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
584 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
585 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
587 const int nb = ((*n) + 256 - 1) / 256;
588 const size_t global_item_size = 256 * nb;
589 const size_t local_item_size = 256;
592 NULL, &global_item_size, &local_item_size,
606 cl_kernel kernel = clCreateKernel(
math_program,
"subcol3_kernel", &err);
609 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
610 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
611 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
612 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
614 const int nb = ((*n) + 256 - 1) / 256;
615 const size_t global_item_size = 256 * nb;
616 const size_t local_item_size = 256;
619 NULL, &global_item_size, &local_item_size,
633 cl_kernel kernel = clCreateKernel(
math_program,
"sub2_kernel", &err);
636 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
637 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
638 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
640 const int nb = ((*n) + 256 - 1) / 256;
641 const size_t global_item_size = 256 * nb;
642 const size_t local_item_size = 256;
645 NULL, &global_item_size, &local_item_size,
659 cl_kernel kernel = clCreateKernel(
math_program,
"sub3_kernel", &err);
662 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
663 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
664 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
665 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
667 const int nb = ((*n) + 256 - 1) / 256;
668 const size_t global_item_size = 256 * nb;
669 const size_t local_item_size = 256;
672 NULL, &global_item_size, &local_item_size,
686 cl_kernel kernel = clCreateKernel(
math_program,
"addcol3_kernel", &err);
689 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
690 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
691 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
692 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
694 const int nb = ((*n) + 256 - 1) / 256;
695 const size_t global_item_size = 256 * nb;
696 const size_t local_item_size = 256;
699 NULL, &global_item_size, &local_item_size,
713 cl_kernel kernel = clCreateKernel(
math_program,
"addcol4_kernel", &err);
716 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
717 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
718 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
719 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &d));
720 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int), n));
722 const int nb = ((*n) + 256 - 1) / 256;
723 const size_t global_item_size = 256 * nb;
724 const size_t local_item_size = 256;
727 NULL, &global_item_size, &local_item_size,
737 void *v1,
void *v2,
void *v3,
int *n) {
743 cl_kernel kernel = clCreateKernel(
math_program,
"vdot3_kernel", &err);
746 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &dot));
747 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &u1));
748 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &u2));
749 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(cl_mem), (
void *) &u3));
750 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(cl_mem), (
void *) &v1));
751 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(cl_mem), (
void *) &v2));
752 CL_CHECK(clSetKernelArg(kernel, 6,
sizeof(cl_mem), (
void *) &v3));
753 CL_CHECK(clSetKernelArg(kernel, 7,
sizeof(
int), n));
755 const int nb = ((*n) + 256 - 1) / 256;
756 const size_t global_item_size = 256 * nb;
757 const size_t local_item_size = 256;
760 NULL, &global_item_size, &local_item_size,
781 const int nb = ((*n) + 256 - 1) / 256;
782 const size_t global_item_size = 256 * nb;
783 const size_t local_item_size = 256;
794 nb *
sizeof(
real), NULL, &err);
798 cl_kernel kernel = clCreateKernel(
math_program,
"glsc3_kernel", &err);
801 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
802 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
803 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &c));
805 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int), n));
808 NULL, &global_item_size, &local_item_size,
809 0, NULL, &kern_wait));
816 for (
i = 0;
i < nb;
i++) {
840 const int nt = 256 / pow2;
841 const int nb = ((*n) + nt - 1) / nt;
842 const size_t local_item_size[2] = {nt, pow2};
843 const size_t global_item_size[2] = {nb * nt, pow2};
854 (*
j) * nb *
sizeof(
real), NULL, &err);
858 cl_kernel kernel = clCreateKernel(
math_program,
"glsc3_many_kernel", &err);
861 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &
w));
862 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &
v));
863 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &mult));
865 CL_CHECK(clSetKernelArg(kernel, 4,
sizeof(
int),
j));
866 CL_CHECK(clSetKernelArg(kernel, 5,
sizeof(
int), n));
869 NULL, global_item_size, local_item_size,
870 0, NULL, &kern_wait));
874 bufred, 1, &kern_wait, NULL));
876 for (k = 0; k < (*j); k++) {
880 for (
i = 0;
i < nb;
i++) {
881 for (k = 0; k < (*j); k++) {
899 const int nb = ((*n) + 256 - 1) / 256;
900 const size_t global_item_size = 256 * nb;
901 const size_t local_item_size = 256;
905 cl_kernel kernel = clCreateKernel(
math_program,
"glsc2_kernel", &err);
908 cl_mem buf_d = clCreateBuffer(
glb_ctx, CL_MEM_READ_WRITE,
909 nb *
sizeof(
real), NULL, &err);
912 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
913 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &b));
914 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *) &buf_d));
915 CL_CHECK(clSetKernelArg(kernel, 3,
sizeof(
int), n));
918 NULL, &global_item_size, &local_item_size,
919 0, NULL, &kern_wait));
923 0, nb *
sizeof(
real),
buf, 1, &kern_wait, NULL));
926 for (
i = 0;
i < nb;
i++) {
931 CL_CHECK(clReleaseMemObject(buf_d));
948 const int nb = ((*n) + 256 - 1) / 256;
949 const size_t global_item_size = 256 * nb;
950 const size_t local_item_size = 256;
954 cl_kernel kernel = clCreateKernel(
math_program,
"glsum_kernel", &err);
957 cl_mem buf_d = clCreateBuffer(
glb_ctx, CL_MEM_READ_WRITE,
958 nb *
sizeof(
real), NULL, &err);
961 CL_CHECK(clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *) &a));
962 CL_CHECK(clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *) &buf_d));
963 CL_CHECK(clSetKernelArg(kernel, 2,
sizeof(
int), n));
966 NULL, &global_item_size, &local_item_size,
967 0, NULL, &kern_wait));
971 0, nb *
sizeof(
real),
buf, 1, &kern_wait, NULL));
974 for (
i = 0;
i < nb;
i++) {
979 CL_CHECK(clReleaseMemObject(buf_d));
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
__global__ void const T *__restrict__ x
void opencl_kernel_jit(const char *kernel, cl_program *program)
void opencl_add3(void *a, void *b, void *c, int *n)
void opencl_addcol3(void *a, void *b, void *c, int *n)
void opencl_invcol1(void *a, int *n)
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n)
void opencl_cmult(void *a, real *c, int *n)
void opencl_sub3(void *a, void *b, void *c, int *n)
void opencl_rone(void *a, int *n)
void opencl_cadd(void *a, real *c, int *n)
void opencl_cmult2(void *a, void *b, real *c, int *n)
void opencl_add4(void *a, void *b, void *c, void *d, int *n)
real opencl_glsc3(void *a, void *b, void *c, int *n)
void opencl_add2s2(void *a, void *b, real *c1, int *n)
void opencl_rzero(void *a, int *n)
void opencl_sub2(void *a, void *b, int *n)
void opencl_col2(void *a, void *b, int *n)
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n)
void opencl_col3(void *a, void *b, void *c, int *n)
void opencl_subcol3(void *a, void *b, void *c, int *n)
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n)
void opencl_invcol2(void *a, void *b, int *n)
void opencl_cadd2(void *a, void *b, real *c, int *n)
void opencl_add2(void *a, void *b, int *n)
void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m)
void opencl_cfill_mask(void *a, void *c, int *size, void *mask, int *mask_size)
void opencl_cfill(void *a, real *c, int *n)
void opencl_add2s1(void *a, void *b, real *c1, int *n)
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
real opencl_glsc2(void *a, void *b, int *n)
real opencl_glsum(void *a, int *n)
void opencl_copy(void *a, void *b, int *n)