Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.c
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifdef __APPLE__
36#include <OpenCL/cl.h>
37#else
38#include <CL/cl.h>
39#endif
40
41#include <stdio.h>
42#include <stdlib.h>
43#include <math.h>
45#include <device/opencl/jit.h>
47#include <device/opencl/check.h>
48
49#include "math_kernel.cl.h"
50
54void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue) {
56 b, a, 0, 0, (*n) * sizeof(real),
57 0, NULL, NULL));
58}
59
63void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m,
65 cl_int err;
66
67 if (math_program == NULL)
69
70 cl_kernel kernel = clCreateKernel(math_program, "masked_copy_kernel", &err);
72
73 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
74 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
75 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
76 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
77 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
78
79 const int nb = ((*n) + 256 - 1) / 256;
80 const size_t global_item_size = 256 * nb;
81 const size_t local_item_size = 256;
82
85 0, NULL, NULL));
87
88}
89
93void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m,
95 cl_int err;
96
97 if (math_program == NULL)
99
100 cl_kernel kernel = clCreateKernel(math_program, "masked_gather_copy_kernel",
101 &err);
102 CL_CHECK(err);
103
104 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
105 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
106 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
107 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
108 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
109
110 const int nb = ((*n) + 256 - 1) / 256;
111 const size_t global_item_size = 256 * nb;
112 const size_t local_item_size = 256;
113
116 0, NULL, NULL));
118
119}
120
124void opencl_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n,
125 int *m, cl_command_queue cmd_queue) {
126 cl_int err;
127
128 if (math_program == NULL)
130
132 "masked_gather_copy_aligned_kernel", &err);
133 CL_CHECK(err);
134
135 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
136 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
137 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
138 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
139 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
140
141 const int nb = ((*n) + 256 - 1) / 256;
142 const size_t global_item_size = 256 * nb;
143 const size_t local_item_size = 256;
144
147 0, NULL, NULL));
149
150}
151
155void opencl_face_masked_gather_copy(void *a, void *b, void *mask, void *facet,
156 int *n1, int *n2, int *lx, int *ly,
157 int *lz, int *m,
159 cl_int err;
160
161 if (math_program == NULL)
163
165 "face_masked_gather_copy_kernel", &err);
166 CL_CHECK(err);
167
168 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
169 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
170 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
171 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &facet));
172 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n1));
173 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n2));
174 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), lx));
175 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), ly));
176 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), lz));
177 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), m));
178
179 const int nb = ((*m) + 256 - 1) / 256;
180 const size_t global_item_size = 256 * nb;
181 const size_t local_item_size = 256;
182
185 0, NULL, NULL));
187
188}
189
193void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m,
195 cl_int err;
196
197 if (math_program == NULL)
199
200 cl_kernel kernel = clCreateKernel(math_program, "masked_scatter_copy_kernel",
201 &err);
202 CL_CHECK(err);
203
204 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
205 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
206 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
207 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
208 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
209
210 const int nb = ((*n) + 256 - 1) / 256;
211 const size_t global_item_size = 256 * nb;
212 const size_t local_item_size = 256;
213
216 0, NULL, NULL));
218
219}
220
224void opencl_masked_scatter_copy_aligned(void *a, void *b, void *mask, int *n, int *m,
226 cl_int err;
227
228 if (math_program == NULL)
230
231 cl_kernel kernel = clCreateKernel(math_program, "masked_scatter_copy_aligned_kernel",
232 &err);
233 CL_CHECK(err);
234
235 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
236 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
237 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
238 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
239 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
240
241 const int nb = ((*n) + 256 - 1) / 256;
242 const size_t global_item_size = 256 * nb;
243 const size_t local_item_size = 256;
244
247 0, NULL, NULL));
249
250}
251
255void opencl_cfill_mask(void* a, void* c, int* size, void* mask, int* mask_size,
257 cl_int err;
258
259 if (math_program == NULL)
261
262 cl_kernel kernel = clCreateKernel(math_program, "cfill_mask_kernel", &err);
263 CL_CHECK(err);
264
265 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
266 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
267 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), size));
268 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &mask));
269 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), mask_size));
270
271 const int nb = ((*mask_size) + 256 - 1) / 256;
272 const size_t global_item_size = 256 * nb;
273 const size_t local_item_size = 256;
274
277 0, NULL, NULL));
279}
280
286 real zero = 0.0;
287
289 (*n) * sizeof(real), 0, NULL, &wait_kern));
291}
292
298 real one = 1.0;
299
301 (*n) * sizeof(real), 0, NULL, &wait_kern));
303}
304
308void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue) {
309 cl_int err;
310
311 if (math_program == NULL)
313
314 cl_kernel kernel = clCreateKernel(math_program, "cmult_kernel", &err);
315 CL_CHECK(err);
316
317 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
318 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
319 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
320
321 const int nb = ((*n) + 256 - 1) / 256;
322 const size_t global_item_size = 256 * nb;
323 const size_t local_item_size = 256;
324
327 0, NULL, NULL));
329}
330
334void opencl_cmult2(void *a, void *b, real *c, int *n,
336 cl_int err;
337
338 if (math_program == NULL)
340
341 cl_kernel kernel = clCreateKernel(math_program, "cmult2_kernel", &err);
342 CL_CHECK(err);
343
344 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
345 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
346 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
347 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
348
349 const int nb = ((*n) + 256 - 1) / 256;
350 const size_t global_item_size = 256 * nb;
351 const size_t local_item_size = 256;
352
355 0, NULL, NULL));
357}
358
362void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue) {
363 cl_int err;
364
365 if (math_program == NULL)
367
368 cl_kernel kernel = clCreateKernel(math_program, "cdiv_kernel", &err);
369 CL_CHECK(err);
370
371 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
372 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
373 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
374
375 const int nb = ((*n) + 256 - 1) / 256;
376 const size_t global_item_size = 256 * nb;
377 const size_t local_item_size = 256;
378
381 0, NULL, NULL));
383}
384
388void opencl_cdiv2(void *a, void *b, real *c, int *n,
390 cl_int err;
391
392 if (math_program == NULL)
394
395 cl_kernel kernel = clCreateKernel(math_program, "cdiv2_kernel", &err);
396 CL_CHECK(err);
397
398 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
399 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
400 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
401 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
402
403 const int nb = ((*n) + 256 - 1) / 256;
404 const size_t global_item_size = 256 * nb;
405 const size_t local_item_size = 256;
406
409 0, NULL, NULL));
411}
412
416void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue) {
417 cl_int err;
418
419 if (math_program == NULL)
421
422 cl_kernel kernel = clCreateKernel(math_program, "radd_kernel", &err);
423 CL_CHECK(err);
424
425 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
426 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
427 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
428
429 const int nb = ((*n) + 256 - 1) / 256;
430 const size_t global_item_size = 256 * nb;
431 const size_t local_item_size = 256;
432
435 0, NULL, NULL));
437}
438
442void opencl_cadd2(void *a, void *b, real *c, int *n,
444 cl_int err;
445
446 if (math_program == NULL)
448
449 cl_kernel kernel = clCreateKernel(math_program, "cadd2_kernel", &err);
450 CL_CHECK(err);
451
452 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
453 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
454 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
455 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
456
457 const int nb = ((*n) + 256 - 1) / 256;
458 const size_t global_item_size = 256 * nb;
459 const size_t local_item_size = 256;
460
463 0, NULL, NULL));
465}
466
470void opencl_cwrap(void *a, real *min_val, real *max_val, int *n,
472 cl_int err;
473
474 if (math_program == NULL)
476
477 cl_kernel kernel = clCreateKernel(math_program, "cwrap_kernel", &err);
478 CL_CHECK(err);
479
480 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
483 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
484
485 const int nb = ((*n) + 256 - 1) / 256;
486 const size_t global_item_size = 256 * nb;
487 const size_t local_item_size = 256;
488
491 0, NULL, NULL));
493}
494
498void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue) {
499 cl_int err;
500
501 if (math_program == NULL)
503
504 cl_kernel kernel = clCreateKernel(math_program, "cfill_kernel", &err);
505 CL_CHECK(err);
506
507 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
508 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
509 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
510
511 const int nb = ((*n) + 256 - 1) / 256;
512 const size_t global_item_size = 256 * nb;
513 const size_t local_item_size = 256;
514
517 0, NULL, NULL));
519}
520
525void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
526 cl_int err;
527
528 if (math_program == NULL)
530
531 cl_kernel kernel = clCreateKernel(math_program, "add2_kernel", &err);
532 CL_CHECK(err);
533
534 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
535 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
536 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
537
538 const int nb = ((*n) + 256 - 1) / 256;
539 const size_t global_item_size = 256 * nb;
540 const size_t local_item_size = 256;
541
544 0, NULL, NULL));
546}
547
552void opencl_add3(void *a, void *b, void *c, int *n,
554 cl_int err;
555
556 if (math_program == NULL)
558
559 cl_kernel kernel = clCreateKernel(math_program, "add3_kernel", &err);
560 CL_CHECK(err);
561
562 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
563 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
564 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
565 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
566
567 const int nb = ((*n) + 256 - 1) / 256;
568 const size_t global_item_size = 256 * nb;
569 const size_t local_item_size = 256;
570
573 0, NULL, NULL));
575}
576
581void opencl_add4(void *a, void *b, void *c, void *d, int *n,
583 cl_int err;
584
585 if (math_program == NULL)
587
588 cl_kernel kernel = clCreateKernel(math_program, "add4_kernel", &err);
589 CL_CHECK(err);
590
591 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
592 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
593 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
594 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
595 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
596
597 const int nb = ((*n) + 256 - 1) / 256;
598 const size_t global_item_size = 256 * nb;
599 const size_t local_item_size = 256;
600
603 0, NULL, NULL));
605}
606
612void opencl_add2s1(void *a, void *b, real *c1, int *n,
614 cl_int err;
615
616 if (math_program == NULL)
618
619 cl_kernel kernel = clCreateKernel(math_program, "add2s1_kernel", &err);
620 CL_CHECK(err);
621
622 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
623 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
624 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
625 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
626
627 const int nb = ((*n) + 256 - 1) / 256;
628 const size_t global_item_size = 256 * nb;
629 const size_t local_item_size = 256;
630
633 0, NULL, NULL));
635}
636
642void opencl_add2s2(void *a, void *b, real *c1, int *n,
644 cl_int err;
645
646 if (math_program == NULL)
648
649 cl_kernel kernel = clCreateKernel(math_program, "add2s2_kernel", &err);
650 CL_CHECK(err);
651
652 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
653 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
654 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
655 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
656
657 const int nb = ((*n) + 256 - 1) / 256;
658 const size_t global_item_size = 256 * nb;
659 const size_t local_item_size = 256;
660
663 0, NULL, NULL));
665}
666
673void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n,
675 cl_int err;
676
677 if (math_program == NULL)
679
680 cl_kernel kernel = clCreateKernel(math_program, "add2s2_many_kernel", &err);
681 CL_CHECK(err);
682
683 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &x));
684 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &p));
685 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &alpha));
686 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), j));
687 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
688
689 const int nb = ((*n) + 256 - 1) / 256;
690 const size_t global_item_size = 256 * nb;
691 const size_t local_item_size = 256;
692
695 0, NULL, NULL));
697
698}
699
705void opencl_addsqr2s2(void *a, void *b, real *c1, int *n,
707 cl_int err;
708
709 if (math_program == NULL)
711
712 cl_kernel kernel = clCreateKernel(math_program, "addsqr2s2_kernel", &err);
713 CL_CHECK(err);
714
715 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
716 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
717 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
718 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
719
720 const int nb = ((*n) + 256 - 1) / 256;
721 const size_t global_item_size = 256 * nb;
722 const size_t local_item_size = 256;
723
726 0, NULL, NULL));
728}
729
734void opencl_add3s2(void *a, void *b, void * c, real *c1, real *c2, int *n,
736 cl_int err;
737
738 if (math_program == NULL)
740
741 cl_kernel kernel = clCreateKernel(math_program, "add3s2_kernel", &err);
742 CL_CHECK(err);
743
744 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
745 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
746 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
747 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), c1));
748 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c2));
749 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
750
751 const int nb = ((*n) + 256 - 1) / 256;
752 const size_t global_item_size = 256 * nb;
753 const size_t local_item_size = 256;
754
757 0, NULL, NULL));
759}
760
765void opencl_add4s3(void *a, void *b, void * c, void * d, real *c1, real *c2,
766 real *c3, int *n, cl_command_queue cmd_queue) {
767 cl_int err;
768
769 if (math_program == NULL)
771
772 cl_kernel kernel = clCreateKernel(math_program, "add4s3_kernel", &err);
773 CL_CHECK(err);
774
775 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
776 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
777 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
778 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
779 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c1));
780 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(real), c2));
781 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(real), c3));
782 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
783
784 const int nb = ((*n) + 256 - 1) / 256;
785 const size_t global_item_size = 256 * nb;
786 const size_t local_item_size = 256;
787
790 0, NULL, NULL));
792}
793
798void opencl_add5s4(void *a, void *b, void * c, void * d, void * e, real *c1,
799 real *c2, real *c3, real * c4, int *n,
801 cl_int err;
802
803 if (math_program == NULL)
805
806 cl_kernel kernel = clCreateKernel(math_program, "add5s4_kernel", &err);
807 CL_CHECK(err);
808
809 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
810 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
811 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
812 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
813 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &e));
814 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(real), c1));
815 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(real), c2));
816 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(real), c3));
817 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(real), c4));
818 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), n));
819
820 const int nb = ((*n) + 256 - 1) / 256;
821 const size_t global_item_size = 256 * nb;
822 const size_t local_item_size = 256;
823
826 0, NULL, NULL));
828}
829
835 cl_int err;
836
837 if (math_program == NULL)
839
840 cl_kernel kernel = clCreateKernel(math_program, "invcol1_kernel", &err);
841 CL_CHECK(err);
842
843 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
844 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
845
846 const int nb = ((*n) + 256 - 1) / 256;
847 const size_t global_item_size = 256 * nb;
848 const size_t local_item_size = 256;
849
852 0, NULL, NULL));
854}
855
860void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
861 cl_int err;
862
863 if (math_program == NULL)
865
866 cl_kernel kernel = clCreateKernel(math_program, "invcol2_kernel", &err);
867 CL_CHECK(err);
868
869 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
870 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
871 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
872
873 const int nb = ((*n) + 256 - 1) / 256;
874 const size_t global_item_size = 256 * nb;
875 const size_t local_item_size = 256;
876
879 0, NULL, NULL));
881}
882
887void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
888 cl_int err;
889
890 if (math_program == NULL)
892
893 cl_kernel kernel = clCreateKernel(math_program, "col2_kernel", &err);
894 CL_CHECK(err);
895
896 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
897 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
898 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
899
900 const int nb = ((*n) + 256 - 1) / 256;
901 const size_t global_item_size = 256 * nb;
902 const size_t local_item_size = 256;
903
906 0, NULL, NULL));
908}
909
914void opencl_col3(void *a, void *b, void *c, int *n,
916 cl_int err;
917
918 if (math_program == NULL)
920
921 cl_kernel kernel = clCreateKernel(math_program, "col3_kernel", &err);
922 CL_CHECK(err);
923
924 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
925 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
926 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
927 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
928
929 const int nb = ((*n) + 256 - 1) / 256;
930 const size_t global_item_size = 256 * nb;
931 const size_t local_item_size = 256;
932
935 0, NULL, NULL));
937}
938
943void opencl_subcol3(void *a, void *b, void *c, int *n,
945 cl_int err;
946
947 if (math_program == NULL)
949
950 cl_kernel kernel = clCreateKernel(math_program, "subcol3_kernel", &err);
951 CL_CHECK(err);
952
953 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
954 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
955 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
956 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
957
958 const int nb = ((*n) + 256 - 1) / 256;
959 const size_t global_item_size = 256 * nb;
960 const size_t local_item_size = 256;
961
964 0, NULL, NULL));
966}
967
972void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
973 cl_int err;
974
975 if (math_program == NULL)
977
978 cl_kernel kernel = clCreateKernel(math_program, "sub2_kernel", &err);
979 CL_CHECK(err);
980
981 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
982 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
983 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
984
985 const int nb = ((*n) + 256 - 1) / 256;
986 const size_t global_item_size = 256 * nb;
987 const size_t local_item_size = 256;
988
991 0, NULL, NULL));
993}
994
999void opencl_sub3(void *a, void *b, void *c, int *n,
1001 cl_int err;
1002
1003 if (math_program == NULL)
1005
1006 cl_kernel kernel = clCreateKernel(math_program, "sub3_kernel", &err);
1007 CL_CHECK(err);
1008
1009 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1010 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1011 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1012 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1013
1014 const int nb = ((*n) + 256 - 1) / 256;
1015 const size_t global_item_size = 256 * nb;
1016 const size_t local_item_size = 256;
1017
1020 0, NULL, NULL));
1022}
1023
1028void opencl_addcol3(void *a, void *b, void *c, int *n,
1030 cl_int err;
1031
1032 if (math_program == NULL)
1034
1035 cl_kernel kernel = clCreateKernel(math_program, "addcol3_kernel", &err);
1036 CL_CHECK(err);
1037
1038 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1039 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1040 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1041 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1042
1043 const int nb = ((*n) + 256 - 1) / 256;
1044 const size_t global_item_size = 256 * nb;
1045 const size_t local_item_size = 256;
1046
1049 0, NULL, NULL));
1051}
1052
1057void opencl_addcol4(void *a, void *b, void *c, void *d, int *n,
1059 cl_int err;
1060
1061 if (math_program == NULL)
1063
1064 cl_kernel kernel = clCreateKernel(math_program, "addcol4_kernel", &err);
1065 CL_CHECK(err);
1066
1067 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1068 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1069 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1070 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
1071 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
1072
1073 const int nb = ((*n) + 256 - 1) / 256;
1074 const size_t global_item_size = 256 * nb;
1075 const size_t local_item_size = 256;
1076
1079 0, NULL, NULL));
1081}
1082
1087void opencl_addcol3s2(void *a, void *b, void *c, real *s, int *n,
1089 cl_int err;
1090
1091 if (math_program == NULL)
1093
1094 cl_kernel kernel = clCreateKernel(math_program, "addcol3s2_kernel", &err);
1095 CL_CHECK(err);
1096
1097 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1098 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1099 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1100 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), s));
1101 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
1102
1103 const int nb = ((*n) + 256 - 1) / 256;
1104 const size_t global_item_size = 256 * nb;
1105 const size_t local_item_size = 256;
1106
1109 0, NULL, NULL));
1111}
1112
1118void opencl_vdot3(void *dot, void *u1, void *u2, void *u3,
1119 void *v1, void *v2, void *v3, int *n,
1121 cl_int err;
1122
1123 if (math_program == NULL)
1125
1126 cl_kernel kernel = clCreateKernel(math_program, "vdot3_kernel", &err);
1127 CL_CHECK(err);
1128
1129 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dot));
1130 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u1));
1131 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u2));
1132 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u3));
1133 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v1));
1134 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v2));
1135 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &v3));
1136 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
1137
1138 const int nb = ((*n) + 256 - 1) / 256;
1139 const size_t global_item_size = 256 * nb;
1140 const size_t local_item_size = 256;
1141
1144 0, NULL, NULL));
1146}
1147
1153void opencl_vcross(void *u1, void *u2, void *u3,
1154 void *v1, void *v2, void *v3,
1155 void *w1, void *w2, void *w3,
1156 int *n, cl_command_queue cmd_queue) {
1157
1158 cl_int err;
1159
1160 if (math_program == NULL)
1162
1163 cl_kernel kernel = clCreateKernel(math_program, "vcross_kernel", &err);
1164 CL_CHECK(err);
1165
1166 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &u1));
1167 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u2));
1168 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u3));
1169 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &v1));
1170 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v2));
1171 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v3));
1172 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &w1));
1173 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &w2));
1174 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3));
1175 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), n));
1176
1177 const int nb = ((*n) + 256 - 1) / 256;
1178 const size_t global_item_size = 256 * nb;
1179 const size_t local_item_size = 256;
1180
1183 0, NULL, NULL));
1185
1186}
1187
1189int red_s = 0;
1192
1197real opencl_glsc3(void *a, void *b, void *c, int *n,
1199 cl_int err;
1201 int i;
1202
1203 if (math_program == NULL)
1205
1206 const int nb = ((*n) + 256 - 1) / 256;
1207 const size_t global_item_size = 256 * nb;
1208 const size_t local_item_size = 256;
1209
1210 if ( nb > red_s){
1211 red_s = nb;
1212 if (bufred != NULL) {
1213 free(bufred);
1215 }
1216 bufred = (real *) malloc(nb * sizeof(real));
1217
1219 nb * sizeof(real), NULL, &err);
1220 CL_CHECK(err);
1221 }
1222
1223 cl_kernel kernel = clCreateKernel(math_program, "glsc3_kernel", &err);
1224 CL_CHECK(err);
1225
1226 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1227 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1228 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1229 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
1230 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
1231
1234 0, NULL, &kern_wait));
1235
1237 nb * sizeof(real), bufred, 1,
1238 &kern_wait, NULL));
1239
1240 real res = 0.0;
1241 for (i = 0; i < nb; i++) {
1242 res += bufred[i];
1243 }
1244
1246
1247 return res;
1248}
1249
1254void opencl_glsc3_many(real *h, void * w, void *v, void *mult, int *j, int *n,
1256 int i, k;
1257 cl_int err;
1259
1260 if (math_program == NULL)
1262
1263 int pow2 = 1;
1264 while(pow2 < (*j)){
1265 pow2 = 2*pow2;
1266 }
1267
1268 const int nt = 256 / pow2;
1269 const int nb = ((*n) + nt - 1) / nt;
1270 const size_t local_item_size[2] = {nt, pow2};
1271 const size_t global_item_size[2] = {nb * nt, pow2};
1272
1273 if((*j)*nb > red_s) {
1274 red_s = (*j)*nb;
1275 if (bufred != NULL) {
1276 free(bufred);
1278 }
1279 bufred = (real *) malloc((*j) * nb * sizeof(real));
1280
1282 (*j) * nb * sizeof(real), NULL, &err);
1283 CL_CHECK(err);
1284 }
1285
1286 cl_kernel kernel = clCreateKernel(math_program, "glsc3_many_kernel", &err);
1287 CL_CHECK(err);
1288
1289 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w));
1290 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &v));
1291 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mult));
1292 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
1293 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), j));
1294 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
1295
1298 0, NULL, &kern_wait));
1299
1301 (*j) * nb * sizeof(real),
1302 bufred, 1, &kern_wait, NULL));
1303
1304 for (k = 0; k < (*j); k++) {
1305 h[k] = 0.0;
1306 }
1307
1308 for (i = 0; i < nb; i++) {
1309 for (k = 0; k < (*j); k++) {
1310 h[k] += bufred[i*(*j)+k];
1311 }
1312 }
1313
1315}
1316
1322 cl_int err;
1324 int i;
1325
1326 if (math_program == NULL)
1328
1329 const int nb = ((*n) + 256 - 1) / 256;
1330 const size_t global_item_size = 256 * nb;
1331 const size_t local_item_size = 256;
1332
1333 real * buf = (real *) malloc(nb * sizeof(real));
1334
1335 cl_kernel kernel = clCreateKernel(math_program, "glsc2_kernel", &err);
1336 CL_CHECK(err);
1337
1339 nb * sizeof(real), NULL, &err);
1340 CL_CHECK(err);
1341
1342 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1343 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1344 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1345 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1346
1349 0, NULL, &kern_wait));
1350
1352 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1353
1354 real res = 0.0;
1355 for (i = 0; i < nb; i++) {
1356 res += buf[i];
1357 }
1358
1359 free(buf);
1363
1364 return res;
1365}
1366
1372 cl_int err;
1374 int i;
1375
1376 if (math_program == NULL)
1378
1379 const int nb = ((*n) + 256 - 1) / 256;
1380 const size_t global_item_size = 256 * nb;
1381 const size_t local_item_size = 256;
1382
1383 real * buf = (real *) malloc(nb * sizeof(real));
1384
1385 cl_kernel kernel = clCreateKernel(math_program, "glsubnorm2_kernel", &err);
1386 CL_CHECK(err);
1387
1389 nb * sizeof(real), NULL, &err);
1390 CL_CHECK(err);
1391
1392 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1393 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1394 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1395 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1396
1399 0, NULL, &kern_wait));
1400
1402 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1403
1404 real res = 0.0;
1405 for (i = 0; i < nb; i++) {
1406 res += buf[i];
1407 }
1408
1409 free(buf);
1413
1414 return res;
1415}
1416
1422 cl_int err;
1424 int i;
1425
1426 if (math_program == NULL)
1428
1429 const int nb = ((*n) + 256 - 1) / 256;
1430 const size_t global_item_size = 256 * nb;
1431 const size_t local_item_size = 256;
1432
1433 real * buf = (real *) malloc(nb * sizeof(real));
1434
1435 cl_kernel kernel = clCreateKernel(math_program, "glsum_kernel", &err);
1436 CL_CHECK(err);
1437
1439 nb * sizeof(real), NULL, &err);
1440 CL_CHECK(err);
1441
1442 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1443 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
1444 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1445
1448 0, NULL, &kern_wait));
1449
1451 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1452
1453 real res = 0.0;
1454 for (i = 0; i < nb; i++) {
1455 res += buf[i];
1456 }
1457
1458 free(buf);
1462
1463 return res;
1464}
1465
1467 cl_int err;
1469 int i;
1470
1471 if (*n <= 0) {
1472 return -((real) HUGE_VAL);
1473 }
1474
1475 if (math_program == NULL)
1477
1478 const int nb = ((*n) + 256 - 1) / 256;
1479 const size_t global_item_size = 256 * nb;
1480 const size_t local_item_size = 256;
1481
1482 real * buf = (real *) malloc(nb * sizeof(real));
1483
1484 cl_kernel kernel = clCreateKernel(math_program, "glmax_kernel", &err);
1485 CL_CHECK(err);
1486
1488 nb * sizeof(real), NULL, &err);
1489 CL_CHECK(err);
1490
1491 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1492 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
1493 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1494
1497 0, NULL, &kern_wait));
1498
1500 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1501
1502 real res = buf[0];
1503 for (i = 1; i < nb; i++) {
1504 res = fmax(res, buf[i]);
1505 }
1506
1507 free(buf);
1511
1512 return res;
1513}
1514
1516 cl_int err;
1518 int i;
1519
1520 if (*n <= 0) {
1521 return (real) HUGE_VAL;
1522 }
1523
1524 if (math_program == NULL)
1526
1527 const int nb = ((*n) + 256 - 1) / 256;
1528 const size_t global_item_size = 256 * nb;
1529 const size_t local_item_size = 256;
1530
1531 real * buf = (real *) malloc(nb * sizeof(real));
1532
1533 cl_kernel kernel = clCreateKernel(math_program, "glmin_kernel", &err);
1534 CL_CHECK(err);
1535
1537 nb * sizeof(real), NULL, &err);
1538 CL_CHECK(err);
1539
1540 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1541 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
1542 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1543
1546 0, NULL, &kern_wait));
1547
1549 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1550
1551 real res = buf[0];
1552 for (i = 1; i < nb; i++) {
1553 res = fmin(res, buf[i]);
1554 }
1555
1556 free(buf);
1560
1561 return res;
1562}
1563
1564
1569 cl_int err;
1570
1571 if (math_program == NULL)
1573
1574 cl_kernel kernel = clCreateKernel(math_program, "absval_kernel", &err);
1575 CL_CHECK(err);
1576
1577 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1578 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
1579
1580 const int nb = ((*n) + 256 - 1) / 256;
1581 const size_t global_item_size = 256 * nb;
1582 const size_t local_item_size = 256;
1583
1586 0, NULL, NULL));
1587}
1588
1592void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue) {
1593 cl_int err;
1594
1595 if (math_program == NULL)
1597
1598 cl_kernel kernel = clCreateKernel(math_program, "iadd_kernel", &err);
1599 CL_CHECK(err);
1600
1601 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1602 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), c));
1603 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1604
1605 const int nb = ((*n) + 256 - 1) / 256;
1606 const size_t global_item_size = 256 * nb;
1607 const size_t local_item_size = 256;
1608
1611 0, NULL, NULL));
1613}
1614
1619void opencl_pwmax_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
1620 cl_int err;
1621
1622 if (math_program == NULL)
1624
1625 cl_kernel kernel = clCreateKernel(math_program, "pwmax_vec2_kernel", &err);
1626 CL_CHECK(err);
1627
1628 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1629 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1630 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1631
1632 const int nb = ((*n) + 256 - 1) / 256;
1633 const size_t global_item_size = 256 * nb;
1634 const size_t local_item_size = 256;
1635
1638 0, NULL, NULL));
1640}
1641
1646void opencl_pwmax_vec3(void *a, void *b, void *c,
1647 int *n, cl_command_queue cmd_queue) {
1648 cl_int err;
1649
1650 if (math_program == NULL)
1652
1653 cl_kernel kernel = clCreateKernel(math_program, "pwmax_vec3_kernel", &err);
1654 CL_CHECK(err);
1655
1656 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1657 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1658 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1659 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1660
1661 const int nb = ((*n) + 256 - 1) / 256;
1662 const size_t global_item_size = 256 * nb;
1663 const size_t local_item_size = 256;
1664
1667 0, NULL, NULL));
1669}
1670
1676 cl_int err;
1677
1678 if (math_program == NULL)
1680
1681 cl_kernel kernel = clCreateKernel(math_program, "pwmax_sca2_kernel", &err);
1682 CL_CHECK(err);
1683
1684 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1685 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
1686 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1687
1688 const int nb = ((*n) + 256 - 1) / 256;
1689 const size_t global_item_size = 256 * nb;
1690 const size_t local_item_size = 256;
1691
1694 0, NULL, NULL));
1696}
1697
1702void opencl_pwmax_sca3(void *a, void *b, real *c,
1703 int *n, cl_command_queue cmd_queue) {
1704 cl_int err;
1705
1706 if (math_program == NULL)
1708
1709 cl_kernel kernel = clCreateKernel(math_program, "pwmax_sca3_kernel", &err);
1710 CL_CHECK(err);
1711
1712 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1713 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1714 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
1715 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1716
1717 const int nb = ((*n) + 256 - 1) / 256;
1718 const size_t global_item_size = 256 * nb;
1719 const size_t local_item_size = 256;
1720
1723 0, NULL, NULL));
1725}
1726
1731void opencl_pwmin_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
1732 cl_int err;
1733
1734 if (math_program == NULL)
1736
1737 cl_kernel kernel = clCreateKernel(math_program, "pwmin_vec2_kernel", &err);
1738 CL_CHECK(err);
1739
1740 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1741 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1742 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1743
1744 const int nb = ((*n) + 256 - 1) / 256;
1745 const size_t global_item_size = 256 * nb;
1746 const size_t local_item_size = 256;
1747
1750 0, NULL, NULL));
1752}
1753
1758void opencl_pwmin_vec3(void *a, void *b, void *c,
1759 int *n, cl_command_queue cmd_queue) {
1760 cl_int err;
1761
1762 if (math_program == NULL)
1764
1765 cl_kernel kernel = clCreateKernel(math_program, "pwmin_vec3_kernel", &err);
1766 CL_CHECK(err);
1767
1768 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1769 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1770 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1771 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1772
1773 const int nb = ((*n) + 256 - 1) / 256;
1774 const size_t global_item_size = 256 * nb;
1775 const size_t local_item_size = 256;
1776
1779 0, NULL, NULL));
1781}
1782
1788 cl_int err;
1789
1790 if (math_program == NULL)
1792
1793 cl_kernel kernel = clCreateKernel(math_program, "pwmin_sca2_kernel", &err);
1794 CL_CHECK(err);
1795
1796 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1797 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
1798 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1799
1800 const int nb = ((*n) + 256 - 1) / 256;
1801 const size_t global_item_size = 256 * nb;
1802 const size_t local_item_size = 256;
1803
1806 0, NULL, NULL));
1808}
1809
1814void opencl_pwmin_sca3(void *a, void *b, real *c,
1815 int *n, cl_command_queue cmd_queue) {
1816 cl_int err;
1817
1818 if (math_program == NULL)
1820
1821 cl_kernel kernel = clCreateKernel(math_program, "pwmin_sca3_kernel", &err);
1822 CL_CHECK(err);
1823
1824 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1825 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1826 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
1827 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1828
1829 const int nb = ((*n) + 256 - 1) / 256;
1830 const size_t global_item_size = 256 * nb;
1831 const size_t local_item_size = 256;
1832
1835 0, NULL, NULL));
1837}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
const int i
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
void * glb_ctx
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition jit.c:50
void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1592
void opencl_col3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:914
void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:362
void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:193
void opencl_pwmax_sca2(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1675
void opencl_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cl_command_queue cmd_queue)
Definition math.c:1153
void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:93
void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:972
void opencl_face_masked_gather_copy(void *a, void *b, void *mask, void *facet, int *n1, int *n2, int *lx, int *ly, int *lz, int *m, cl_command_queue cmd_queue)
Definition math.c:155
void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:887
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:673
void opencl_sub3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:999
void opencl_add2s1(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:612
void opencl_addcol3s2(void *a, void *b, void *c, real *s, int *n, cl_command_queue cmd_queue)
Definition math.c:1087
void opencl_invcol1(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:834
void opencl_add3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:552
real opencl_glsc3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1197
void opencl_rone(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:296
void opencl_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, cl_command_queue cmd_queue)
Definition math.c:798
void opencl_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, cl_command_queue cmd_queue)
Definition math.c:765
void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:308
void opencl_cfill_mask(void *a, void *c, int *size, void *mask, int *mask_size, cl_command_queue cmd_queue)
Definition math.c:255
void opencl_cadd2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:442
void opencl_masked_scatter_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:224
void opencl_pwmin_sca3(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1814
void opencl_pwmax_vec3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1646
real * bufred
Definition math.c:1190
real opencl_glsum(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1421
void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:63
void opencl_add4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:581
void opencl_pwmin_vec3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1758
void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:416
void opencl_add2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:642
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cl_command_queue cmd_queue)
Definition math.c:1118
void opencl_pwmin_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1731
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cl_command_queue cmd_queue)
Definition math.c:734
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:1254
real opencl_glsubnorm2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1371
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:705
int red_s
Definition math.c:1189
void opencl_absval(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1568
void opencl_cwrap(void *a, real *min_val, real *max_val, int *n, cl_command_queue cmd_queue)
Definition math.c:470
void opencl_addcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1028
cl_mem bufred_d
Definition math.c:1191
real opencl_glmin(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1515
void opencl_pwmax_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1619
void opencl_rzero(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:284
void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:54
void opencl_subcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:943
void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:525
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:1057
void opencl_pwmin_sca2(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1787
void opencl_pwmax_sca3(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1702
void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:860
real opencl_glsc2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1321
void opencl_cdiv2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:388
void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:498
void opencl_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:124
real opencl_glmax(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1466
void opencl_cmult2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:334
Object for handling masks in Neko.
Definition mask.f90:34
#define CL_CHECK(err)
Definition check.h:12
real * buf
Definition pipecg_aux.cu:42
void * math_program