Neko 1.99.2
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.c
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifdef __APPLE__
36#include <OpenCL/cl.h>
37#else
38#include <CL/cl.h>
39#endif
40
41#include <stdio.h>
42#include <stdlib.h>
44#include <device/opencl/jit.h>
46#include <device/opencl/check.h>
47
48#include "math_kernel.cl.h"
49
53void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue) {
55 b, a, 0, 0, (*n) * sizeof(real),
56 0, NULL, NULL));
57}
58
62void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m,
64 cl_int err;
65
66 if (math_program == NULL)
68
69 cl_kernel kernel = clCreateKernel(math_program, "masked_copy_kernel", &err);
71
72 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
73 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
74 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
75 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
76 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
77
78 const int nb = ((*n) + 256 - 1) / 256;
79 const size_t global_item_size = 256 * nb;
80 const size_t local_item_size = 256;
81
84 0, NULL, NULL));
86
87}
88
92void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m,
94 cl_int err;
95
96 if (math_program == NULL)
98
99 cl_kernel kernel = clCreateKernel(math_program, "masked_gather_copy_kernel",
100 &err);
101 CL_CHECK(err);
102
103 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
104 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
105 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
106 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
107 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
108
109 const int nb = ((*n) + 256 - 1) / 256;
110 const size_t global_item_size = 256 * nb;
111 const size_t local_item_size = 256;
112
115 0, NULL, NULL));
117
118}
119
123void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m,
125 cl_int err;
126
127 if (math_program == NULL)
129
130 cl_kernel kernel = clCreateKernel(math_program, "masked_scatter_copy_kernel",
131 &err);
132 CL_CHECK(err);
133
134 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
135 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
136 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
137 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
138 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
139
140 const int nb = ((*n) + 256 - 1) / 256;
141 const size_t global_item_size = 256 * nb;
142 const size_t local_item_size = 256;
143
146 0, NULL, NULL));
148
149}
150
154void opencl_cfill_mask(void* a, void* c, int* size, void* mask, int* mask_size,
156 cl_int err;
157
158 if (math_program == NULL)
160
161 cl_kernel kernel = clCreateKernel(math_program, "cfill_mask_kernel", &err);
162 CL_CHECK(err);
163
164 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
165 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
166 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), size));
167 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &mask));
168 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), mask_size));
169
170 const int nb = ((*mask_size) + 256 - 1) / 256;
171 const size_t global_item_size = 256 * nb;
172 const size_t local_item_size = 256;
173
176 0, NULL, NULL));
178}
179
185 real zero = 0.0;
186
188 (*n) * sizeof(real), 0, NULL, &wait_kern));
190}
191
197 real one = 1.0;
198
200 (*n) * sizeof(real), 0, NULL, &wait_kern));
202}
203
207void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue) {
208 cl_int err;
209
210 if (math_program == NULL)
212
213 cl_kernel kernel = clCreateKernel(math_program, "cmult_kernel", &err);
214 CL_CHECK(err);
215
216 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
217 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
218 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
219
220 const int nb = ((*n) + 256 - 1) / 256;
221 const size_t global_item_size = 256 * nb;
222 const size_t local_item_size = 256;
223
226 0, NULL, NULL));
228}
229
233void opencl_cmult2(void *a, void *b, real *c, int *n,
235 cl_int err;
236
237 if (math_program == NULL)
239
240 cl_kernel kernel = clCreateKernel(math_program, "cmult2_kernel", &err);
241 CL_CHECK(err);
242
243 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
244 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
245 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
246 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
247
248 const int nb = ((*n) + 256 - 1) / 256;
249 const size_t global_item_size = 256 * nb;
250 const size_t local_item_size = 256;
251
254 0, NULL, NULL));
256}
257
261void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue) {
262 cl_int err;
263
264 if (math_program == NULL)
266
267 cl_kernel kernel = clCreateKernel(math_program, "cdiv_kernel", &err);
268 CL_CHECK(err);
269
270 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
271 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
272 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
273
274 const int nb = ((*n) + 256 - 1) / 256;
275 const size_t global_item_size = 256 * nb;
276 const size_t local_item_size = 256;
277
280 0, NULL, NULL));
282}
283
287void opencl_cdiv2(void *a, void *b, real *c, int *n,
289 cl_int err;
290
291 if (math_program == NULL)
293
294 cl_kernel kernel = clCreateKernel(math_program, "cdiv2_kernel", &err);
295 CL_CHECK(err);
296
297 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
298 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
299 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
300 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
301
302 const int nb = ((*n) + 256 - 1) / 256;
303 const size_t global_item_size = 256 * nb;
304 const size_t local_item_size = 256;
305
308 0, NULL, NULL));
310}
311
315void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue) {
316 cl_int err;
317
318 if (math_program == NULL)
320
321 cl_kernel kernel = clCreateKernel(math_program, "radd_kernel", &err);
322 CL_CHECK(err);
323
324 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
325 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
326 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
327
328 const int nb = ((*n) + 256 - 1) / 256;
329 const size_t global_item_size = 256 * nb;
330 const size_t local_item_size = 256;
331
334 0, NULL, NULL));
336}
337
341void opencl_cadd2(void *a, void *b, real *c, int *n,
343 cl_int err;
344
345 if (math_program == NULL)
347
348 cl_kernel kernel = clCreateKernel(math_program, "cadd2_kernel", &err);
349 CL_CHECK(err);
350
351 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
352 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
353 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
354 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
355
356 const int nb = ((*n) + 256 - 1) / 256;
357 const size_t global_item_size = 256 * nb;
358 const size_t local_item_size = 256;
359
362 0, NULL, NULL));
364}
365
369void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue) {
370 cl_int err;
371
372 if (math_program == NULL)
374
375 cl_kernel kernel = clCreateKernel(math_program, "cfill_kernel", &err);
376 CL_CHECK(err);
377
378 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
379 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
380 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
381
382 const int nb = ((*n) + 256 - 1) / 256;
383 const size_t global_item_size = 256 * nb;
384 const size_t local_item_size = 256;
385
388 0, NULL, NULL));
390}
391
396void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
397 cl_int err;
398
399 if (math_program == NULL)
401
402 cl_kernel kernel = clCreateKernel(math_program, "add2_kernel", &err);
403 CL_CHECK(err);
404
405 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
406 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
407 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
408
409 const int nb = ((*n) + 256 - 1) / 256;
410 const size_t global_item_size = 256 * nb;
411 const size_t local_item_size = 256;
412
415 0, NULL, NULL));
417}
418
423void opencl_add3(void *a, void *b, void *c, int *n,
425 cl_int err;
426
427 if (math_program == NULL)
429
430 cl_kernel kernel = clCreateKernel(math_program, "add3_kernel", &err);
431 CL_CHECK(err);
432
433 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
434 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
435 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
436 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
437
438 const int nb = ((*n) + 256 - 1) / 256;
439 const size_t global_item_size = 256 * nb;
440 const size_t local_item_size = 256;
441
444 0, NULL, NULL));
446}
447
452void opencl_add4(void *a, void *b, void *c, void *d, int *n,
454 cl_int err;
455
456 if (math_program == NULL)
458
459 cl_kernel kernel = clCreateKernel(math_program, "add4_kernel", &err);
460 CL_CHECK(err);
461
462 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
463 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
464 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
465 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
466 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
467
468 const int nb = ((*n) + 256 - 1) / 256;
469 const size_t global_item_size = 256 * nb;
470 const size_t local_item_size = 256;
471
474 0, NULL, NULL));
476}
477
483void opencl_add2s1(void *a, void *b, real *c1, int *n,
485 cl_int err;
486
487 if (math_program == NULL)
489
490 cl_kernel kernel = clCreateKernel(math_program, "add2s1_kernel", &err);
491 CL_CHECK(err);
492
493 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
494 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
495 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
496 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
497
498 const int nb = ((*n) + 256 - 1) / 256;
499 const size_t global_item_size = 256 * nb;
500 const size_t local_item_size = 256;
501
504 0, NULL, NULL));
506}
507
513void opencl_add2s2(void *a, void *b, real *c1, int *n,
515 cl_int err;
516
517 if (math_program == NULL)
519
520 cl_kernel kernel = clCreateKernel(math_program, "add2s2_kernel", &err);
521 CL_CHECK(err);
522
523 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
524 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
525 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
526 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
527
528 const int nb = ((*n) + 256 - 1) / 256;
529 const size_t global_item_size = 256 * nb;
530 const size_t local_item_size = 256;
531
534 0, NULL, NULL));
536}
537
544void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n,
546 cl_int err;
547
548 if (math_program == NULL)
550
551 cl_kernel kernel = clCreateKernel(math_program, "add2s2_many_kernel", &err);
552 CL_CHECK(err);
553
554 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &x));
555 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &p));
556 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &alpha));
557 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), j));
558 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
559
560 const int nb = ((*n) + 256 - 1) / 256;
561 const size_t global_item_size = 256 * nb;
562 const size_t local_item_size = 256;
563
566 0, NULL, NULL));
568
569}
570
576void opencl_addsqr2s2(void *a, void *b, real *c1, int *n,
578 cl_int err;
579
580 if (math_program == NULL)
582
583 cl_kernel kernel = clCreateKernel(math_program, "addsqr2s2_kernel", &err);
584 CL_CHECK(err);
585
586 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
587 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
588 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
589 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
590
591 const int nb = ((*n) + 256 - 1) / 256;
592 const size_t global_item_size = 256 * nb;
593 const size_t local_item_size = 256;
594
597 0, NULL, NULL));
599}
600
605void opencl_add3s2(void *a, void *b, void * c, real *c1, real *c2, int *n,
607 cl_int err;
608
609 if (math_program == NULL)
611
612 cl_kernel kernel = clCreateKernel(math_program, "add3s2_kernel", &err);
613 CL_CHECK(err);
614
615 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
616 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
617 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
618 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), c1));
619 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c2));
620 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
621
622 const int nb = ((*n) + 256 - 1) / 256;
623 const size_t global_item_size = 256 * nb;
624 const size_t local_item_size = 256;
625
628 0, NULL, NULL));
630}
631
636void opencl_add4s3(void *a, void *b, void * c, void * d, real *c1, real *c2,
637 real *c3, int *n, cl_command_queue cmd_queue) {
638 cl_int err;
639
640 if (math_program == NULL)
642
643 cl_kernel kernel = clCreateKernel(math_program, "add4s3_kernel", &err);
644 CL_CHECK(err);
645
646 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
647 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
648 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
649 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
650 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c1));
651 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(real), c2));
652 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(real), c3));
653 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
654
655 const int nb = ((*n) + 256 - 1) / 256;
656 const size_t global_item_size = 256 * nb;
657 const size_t local_item_size = 256;
658
661 0, NULL, NULL));
663}
664
669void opencl_add5s4(void *a, void *b, void * c, void * d, void * e, real *c1,
670 real *c2, real *c3, real * c4, int *n,
672 cl_int err;
673
674 if (math_program == NULL)
676
677 cl_kernel kernel = clCreateKernel(math_program, "add5s4_kernel", &err);
678 CL_CHECK(err);
679
680 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
681 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
682 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
683 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
684 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &e));
685 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(real), c1));
686 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(real), c2));
687 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(real), c3));
688 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(real), c4));
689 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), n));
690
691 const int nb = ((*n) + 256 - 1) / 256;
692 const size_t global_item_size = 256 * nb;
693 const size_t local_item_size = 256;
694
697 0, NULL, NULL));
699}
700
706 cl_int err;
707
708 if (math_program == NULL)
710
711 cl_kernel kernel = clCreateKernel(math_program, "invcol1_kernel", &err);
712 CL_CHECK(err);
713
714 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
715 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
716
717 const int nb = ((*n) + 256 - 1) / 256;
718 const size_t global_item_size = 256 * nb;
719 const size_t local_item_size = 256;
720
723 0, NULL, NULL));
725}
726
731void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
732 cl_int err;
733
734 if (math_program == NULL)
736
737 cl_kernel kernel = clCreateKernel(math_program, "invcol2_kernel", &err);
738 CL_CHECK(err);
739
740 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
741 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
742 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
743
744 const int nb = ((*n) + 256 - 1) / 256;
745 const size_t global_item_size = 256 * nb;
746 const size_t local_item_size = 256;
747
750 0, NULL, NULL));
752}
753
758void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
759 cl_int err;
760
761 if (math_program == NULL)
763
764 cl_kernel kernel = clCreateKernel(math_program, "col2_kernel", &err);
765 CL_CHECK(err);
766
767 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
768 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
769 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
770
771 const int nb = ((*n) + 256 - 1) / 256;
772 const size_t global_item_size = 256 * nb;
773 const size_t local_item_size = 256;
774
777 0, NULL, NULL));
779}
780
785void opencl_col3(void *a, void *b, void *c, int *n,
787 cl_int err;
788
789 if (math_program == NULL)
791
792 cl_kernel kernel = clCreateKernel(math_program, "col3_kernel", &err);
793 CL_CHECK(err);
794
795 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
796 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
797 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
798 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
799
800 const int nb = ((*n) + 256 - 1) / 256;
801 const size_t global_item_size = 256 * nb;
802 const size_t local_item_size = 256;
803
806 0, NULL, NULL));
808}
809
814void opencl_subcol3(void *a, void *b, void *c, int *n,
816 cl_int err;
817
818 if (math_program == NULL)
820
821 cl_kernel kernel = clCreateKernel(math_program, "subcol3_kernel", &err);
822 CL_CHECK(err);
823
824 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
825 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
826 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
827 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
828
829 const int nb = ((*n) + 256 - 1) / 256;
830 const size_t global_item_size = 256 * nb;
831 const size_t local_item_size = 256;
832
835 0, NULL, NULL));
837}
838
843void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
844 cl_int err;
845
846 if (math_program == NULL)
848
849 cl_kernel kernel = clCreateKernel(math_program, "sub2_kernel", &err);
850 CL_CHECK(err);
851
852 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
853 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
854 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
855
856 const int nb = ((*n) + 256 - 1) / 256;
857 const size_t global_item_size = 256 * nb;
858 const size_t local_item_size = 256;
859
862 0, NULL, NULL));
864}
865
870void opencl_sub3(void *a, void *b, void *c, int *n,
872 cl_int err;
873
874 if (math_program == NULL)
876
877 cl_kernel kernel = clCreateKernel(math_program, "sub3_kernel", &err);
878 CL_CHECK(err);
879
880 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
881 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
882 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
883 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
884
885 const int nb = ((*n) + 256 - 1) / 256;
886 const size_t global_item_size = 256 * nb;
887 const size_t local_item_size = 256;
888
891 0, NULL, NULL));
893}
894
899void opencl_addcol3(void *a, void *b, void *c, int *n,
901 cl_int err;
902
903 if (math_program == NULL)
905
906 cl_kernel kernel = clCreateKernel(math_program, "addcol3_kernel", &err);
907 CL_CHECK(err);
908
909 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
910 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
911 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
912 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
913
914 const int nb = ((*n) + 256 - 1) / 256;
915 const size_t global_item_size = 256 * nb;
916 const size_t local_item_size = 256;
917
920 0, NULL, NULL));
922}
923
928void opencl_addcol4(void *a, void *b, void *c, void *d, int *n,
930 cl_int err;
931
932 if (math_program == NULL)
934
935 cl_kernel kernel = clCreateKernel(math_program, "addcol4_kernel", &err);
936 CL_CHECK(err);
937
938 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
939 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
940 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
941 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
942 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
943
944 const int nb = ((*n) + 256 - 1) / 256;
945 const size_t global_item_size = 256 * nb;
946 const size_t local_item_size = 256;
947
950 0, NULL, NULL));
952}
953
958void opencl_addcol3s2(void *a, void *b, void *c, real *s, int *n,
960 cl_int err;
961
962 if (math_program == NULL)
964
965 cl_kernel kernel = clCreateKernel(math_program, "addcol3s2_kernel", &err);
966 CL_CHECK(err);
967
968 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
969 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
970 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
971 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), s));
972 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
973
974 const int nb = ((*n) + 256 - 1) / 256;
975 const size_t global_item_size = 256 * nb;
976 const size_t local_item_size = 256;
977
980 0, NULL, NULL));
982}
983
989void opencl_vdot3(void *dot, void *u1, void *u2, void *u3,
990 void *v1, void *v2, void *v3, int *n,
992 cl_int err;
993
994 if (math_program == NULL)
996
997 cl_kernel kernel = clCreateKernel(math_program, "vdot3_kernel", &err);
998 CL_CHECK(err);
999
1000 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dot));
1001 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u1));
1002 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u2));
1003 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u3));
1004 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v1));
1005 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v2));
1006 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &v3));
1007 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
1008
1009 const int nb = ((*n) + 256 - 1) / 256;
1010 const size_t global_item_size = 256 * nb;
1011 const size_t local_item_size = 256;
1012
1015 0, NULL, NULL));
1017}
1018
1024void opencl_vcross(void *u1, void *u2, void *u3,
1025 void *v1, void *v2, void *v3,
1026 void *w1, void *w2, void *w3,
1027 int *n, cl_command_queue cmd_queue) {
1028
1029 cl_int err;
1030
1031 if (math_program == NULL)
1033
1034 cl_kernel kernel = clCreateKernel(math_program, "vcross_kernel", &err);
1035 CL_CHECK(err);
1036
1037 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &u1));
1038 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u2));
1039 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u3));
1040 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &v1));
1041 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v2));
1042 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v3));
1043 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &w1));
1044 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &w2));
1045 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3));
1046 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), n));
1047
1048 const int nb = ((*n) + 256 - 1) / 256;
1049 const size_t global_item_size = 256 * nb;
1050 const size_t local_item_size = 256;
1051
1054 0, NULL, NULL));
1056
1057}
1058
1060int red_s = 0;
1063
1068real opencl_glsc3(void *a, void *b, void *c, int *n,
1070 cl_int err;
1072 int i;
1073
1074 if (math_program == NULL)
1076
1077 const int nb = ((*n) + 256 - 1) / 256;
1078 const size_t global_item_size = 256 * nb;
1079 const size_t local_item_size = 256;
1080
1081 if ( nb > red_s){
1082 red_s = nb;
1083 if (bufred != NULL) {
1084 free(bufred);
1086 }
1087 bufred = (real *) malloc(nb * sizeof(real));
1088
1090 nb * sizeof(real), NULL, &err);
1091 CL_CHECK(err);
1092 }
1093
1094 cl_kernel kernel = clCreateKernel(math_program, "glsc3_kernel", &err);
1095 CL_CHECK(err);
1096
1097 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1098 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1099 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1100 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
1101 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
1102
1105 0, NULL, &kern_wait));
1106
1108 nb * sizeof(real), bufred, 1,
1109 &kern_wait, NULL));
1110
1111 real res = 0.0;
1112 for (i = 0; i < nb; i++) {
1113 res += bufred[i];
1114 }
1115
1117
1118 return res;
1119}
1120
1125void opencl_glsc3_many(real *h, void * w, void *v, void *mult, int *j, int *n,
1127 int i, k;
1128 cl_int err;
1130
1131 if (math_program == NULL)
1133
1134 int pow2 = 1;
1135 while(pow2 < (*j)){
1136 pow2 = 2*pow2;
1137 }
1138
1139 const int nt = 256 / pow2;
1140 const int nb = ((*n) + nt - 1) / nt;
1141 const size_t local_item_size[2] = {nt, pow2};
1142 const size_t global_item_size[2] = {nb * nt, pow2};
1143
1144 if((*j)*nb > red_s) {
1145 red_s = (*j)*nb;
1146 if (bufred != NULL) {
1147 free(bufred);
1149 }
1150 bufred = (real *) malloc((*j) * nb * sizeof(real));
1151
1153 (*j) * nb * sizeof(real), NULL, &err);
1154 CL_CHECK(err);
1155 }
1156
1157 cl_kernel kernel = clCreateKernel(math_program, "glsc3_many_kernel", &err);
1158 CL_CHECK(err);
1159
1160 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w));
1161 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &v));
1162 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mult));
1163 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
1164 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), j));
1165 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
1166
1169 0, NULL, &kern_wait));
1170
1172 (*j) * nb * sizeof(real),
1173 bufred, 1, &kern_wait, NULL));
1174
1175 for (k = 0; k < (*j); k++) {
1176 h[k] = 0.0;
1177 }
1178
1179 for (i = 0; i < nb; i++) {
1180 for (k = 0; k < (*j); k++) {
1181 h[k] += bufred[i*(*j)+k];
1182 }
1183 }
1184
1186}
1187
1193 cl_int err;
1195 int i;
1196
1197 if (math_program == NULL)
1199
1200 const int nb = ((*n) + 256 - 1) / 256;
1201 const size_t global_item_size = 256 * nb;
1202 const size_t local_item_size = 256;
1203
1204 real * buf = (real *) malloc(nb * sizeof(real));
1205
1206 cl_kernel kernel = clCreateKernel(math_program, "glsc2_kernel", &err);
1207 CL_CHECK(err);
1208
1210 nb * sizeof(real), NULL, &err);
1211 CL_CHECK(err);
1212
1213 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1214 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1215 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1216 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1217
1220 0, NULL, &kern_wait));
1221
1223 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1224
1225 real res = 0.0;
1226 for (i = 0; i < nb; i++) {
1227 res += buf[i];
1228 }
1229
1230 free(buf);
1233
1234 return res;
1235}
1236
1242 cl_int err;
1244 int i;
1245
1246 if (math_program == NULL)
1248
1249 const int nb = ((*n) + 256 - 1) / 256;
1250 const size_t global_item_size = 256 * nb;
1251 const size_t local_item_size = 256;
1252
1253 real * buf = (real *) malloc(nb * sizeof(real));
1254
1255 cl_kernel kernel = clCreateKernel(math_program, "glsubnorm2_kernel", &err);
1256 CL_CHECK(err);
1257
1259 nb * sizeof(real), NULL, &err);
1260 CL_CHECK(err);
1261
1262 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1263 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1264 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1265 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1266
1269 0, NULL, &kern_wait));
1270
1272 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1273
1274 real res = 0.0;
1275 for (i = 0; i < nb; i++) {
1276 res += buf[i];
1277 }
1278
1279 free(buf);
1282
1283 return res;
1284}
1285
1291 cl_int err;
1293 int i;
1294
1295 if (math_program == NULL)
1297
1298 const int nb = ((*n) + 256 - 1) / 256;
1299 const size_t global_item_size = 256 * nb;
1300 const size_t local_item_size = 256;
1301
1302 real * buf = (real *) malloc(nb * sizeof(real));
1303
1304 cl_kernel kernel = clCreateKernel(math_program, "glsum_kernel", &err);
1305 CL_CHECK(err);
1306
1308 nb * sizeof(real), NULL, &err);
1309 CL_CHECK(err);
1310
1311 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1312 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
1313 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1314
1317 0, NULL, &kern_wait));
1318
1320 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1321
1322 real res = 0.0;
1323 for (i = 0; i < nb; i++) {
1324 res += buf[i];
1325 }
1326
1327 free(buf);
1330
1331 return res;
1332}
1333
1334
1339 cl_int err;
1340
1341 if (math_program == NULL)
1343
1344 cl_kernel kernel = clCreateKernel(math_program, "absval_kernel", &err);
1345 CL_CHECK(err);
1346
1347 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1348 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
1349
1350 const int nb = ((*n) + 256 - 1) / 256;
1351 const size_t global_item_size = 256 * nb;
1352 const size_t local_item_size = 256;
1353
1356 0, NULL, NULL));
1357}
1358
1362void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue) {
1363 cl_int err;
1364
1365 if (math_program == NULL)
1367
1368 cl_kernel kernel = clCreateKernel(math_program, "iadd_kernel", &err);
1369 CL_CHECK(err);
1370
1371 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1372 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), c));
1373 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1374
1375 const int nb = ((*n) + 256 - 1) / 256;
1376 const size_t global_item_size = 256 * nb;
1377 const size_t local_item_size = 256;
1378
1381 0, NULL, NULL));
1383}
1384
1389void opencl_pwmax_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
1390 cl_int err;
1391
1392 if (math_program == NULL)
1394
1395 cl_kernel kernel = clCreateKernel(math_program, "pwmax_vec2_kernel", &err);
1396 CL_CHECK(err);
1397
1398 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1399 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1400 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1401
1402 const int nb = ((*n) + 256 - 1) / 256;
1403 const size_t global_item_size = 256 * nb;
1404 const size_t local_item_size = 256;
1405
1408 0, NULL, NULL));
1410}
1411
1416void opencl_pwmax_vec3(void *a, void *b, void *c,
1417 int *n, cl_command_queue cmd_queue) {
1418 cl_int err;
1419
1420 if (math_program == NULL)
1422
1423 cl_kernel kernel = clCreateKernel(math_program, "pwmax_vec3_kernel", &err);
1424 CL_CHECK(err);
1425
1426 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1427 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1428 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1429 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1430
1431 const int nb = ((*n) + 256 - 1) / 256;
1432 const size_t global_item_size = 256 * nb;
1433 const size_t local_item_size = 256;
1434
1437 0, NULL, NULL));
1439}
1440
1446 cl_int err;
1447
1448 if (math_program == NULL)
1450
1451 cl_kernel kernel = clCreateKernel(math_program, "pwmax_sca2_kernel", &err);
1452 CL_CHECK(err);
1453
1454 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1455 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
1456 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1457
1458 const int nb = ((*n) + 256 - 1) / 256;
1459 const size_t global_item_size = 256 * nb;
1460 const size_t local_item_size = 256;
1461
1464 0, NULL, NULL));
1466}
1467
1472void opencl_pwmax_sca3(void *a, void *b, real *c,
1473 int *n, cl_command_queue cmd_queue) {
1474 cl_int err;
1475
1476 if (math_program == NULL)
1478
1479 cl_kernel kernel = clCreateKernel(math_program, "pwmax_sca3_kernel", &err);
1480 CL_CHECK(err);
1481
1482 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1483 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1484 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
1485 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1486
1487 const int nb = ((*n) + 256 - 1) / 256;
1488 const size_t global_item_size = 256 * nb;
1489 const size_t local_item_size = 256;
1490
1493 0, NULL, NULL));
1495}
1496
1501void opencl_pwmin_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
1502 cl_int err;
1503
1504 if (math_program == NULL)
1506
1507 cl_kernel kernel = clCreateKernel(math_program, "pwmin_vec2_kernel", &err);
1508 CL_CHECK(err);
1509
1510 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1511 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1512 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1513
1514 const int nb = ((*n) + 256 - 1) / 256;
1515 const size_t global_item_size = 256 * nb;
1516 const size_t local_item_size = 256;
1517
1520 0, NULL, NULL));
1522}
1523
1528void opencl_pwmin_vec3(void *a, void *b, void *c,
1529 int *n, cl_command_queue cmd_queue) {
1530 cl_int err;
1531
1532 if (math_program == NULL)
1534
1535 cl_kernel kernel = clCreateKernel(math_program, "pwmin_vec3_kernel", &err);
1536 CL_CHECK(err);
1537
1538 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1539 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1540 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1541 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1542
1543 const int nb = ((*n) + 256 - 1) / 256;
1544 const size_t global_item_size = 256 * nb;
1545 const size_t local_item_size = 256;
1546
1549 0, NULL, NULL));
1551}
1552
1558 cl_int err;
1559
1560 if (math_program == NULL)
1562
1563 cl_kernel kernel = clCreateKernel(math_program, "pwmin_sca2_kernel", &err);
1564 CL_CHECK(err);
1565
1566 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1567 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
1568 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1569
1570 const int nb = ((*n) + 256 - 1) / 256;
1571 const size_t global_item_size = 256 * nb;
1572 const size_t local_item_size = 256;
1573
1576 0, NULL, NULL));
1578}
1579
1584void opencl_pwmin_sca3(void *a, void *b, real *c,
1585 int *n, cl_command_queue cmd_queue) {
1586 cl_int err;
1587
1588 if (math_program == NULL)
1590
1591 cl_kernel kernel = clCreateKernel(math_program, "pwmin_sca3_kernel", &err);
1592 CL_CHECK(err);
1593
1594 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1595 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1596 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
1597 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1598
1599 const int nb = ((*n) + 256 - 1) / 256;
1600 const size_t global_item_size = 256 * nb;
1601 const size_t local_item_size = 256;
1602
1605 0, NULL, NULL));
1607}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
const int i
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
void * glb_ctx
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition jit.c:50
void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1362
void opencl_col3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:785
void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:261
void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:123
void opencl_pwmax_sca2(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1445
void opencl_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cl_command_queue cmd_queue)
Definition math.c:1024
void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:92
void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:843
void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:758
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:544
void opencl_sub3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:870
void opencl_add2s1(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:483
void opencl_addcol3s2(void *a, void *b, void *c, real *s, int *n, cl_command_queue cmd_queue)
Definition math.c:958
void opencl_invcol1(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:705
void opencl_add3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:423
real opencl_glsc3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1068
void opencl_rone(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:195
void opencl_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, cl_command_queue cmd_queue)
Definition math.c:669
void opencl_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, cl_command_queue cmd_queue)
Definition math.c:636
void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:207
void opencl_cfill_mask(void *a, void *c, int *size, void *mask, int *mask_size, cl_command_queue cmd_queue)
Definition math.c:154
void opencl_cadd2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:341
void opencl_pwmin_sca3(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1584
void opencl_pwmax_vec3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1416
real * bufred
Definition math.c:1061
real opencl_glsum(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1290
void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:62
void opencl_add4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:452
void opencl_pwmin_vec3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1528
void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:315
void opencl_add2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:513
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cl_command_queue cmd_queue)
Definition math.c:989
void opencl_pwmin_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1501
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cl_command_queue cmd_queue)
Definition math.c:605
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:1125
real opencl_glsubnorm2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1241
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:576
int red_s
Definition math.c:1060
void opencl_absval(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1338
void opencl_addcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:899
cl_mem bufred_d
Definition math.c:1062
void opencl_pwmax_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1389
void opencl_rzero(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:183
void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:53
void opencl_subcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:814
void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:396
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:928
void opencl_pwmin_sca2(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1557
void opencl_pwmax_sca3(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1472
void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:731
real opencl_glsc2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1192
void opencl_cdiv2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:287
void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:369
void opencl_cmult2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:233
Object for handling masks in Neko.
Definition mask.f90:34
#define CL_CHECK(err)
Definition check.h:12
real * buf
Definition pipecg_aux.cu:42
void * math_program