Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.c
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifdef __APPLE__
36#include <OpenCL/cl.h>
37#else
38#include <CL/cl.h>
39#endif
40
41#include <stdio.h>
42#include <stdlib.h>
43#include <math.h>
45#include <device/opencl/jit.h>
47#include <device/opencl/check.h>
48
49#include "math_kernel.cl.h"
50
54void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue) {
56 b, a, 0, 0, (*n) * sizeof(real),
57 0, NULL, NULL));
58}
59
64void opencl_masked_copy_0(void *a, void *b, void *mask, int *n, int *m,
66 cl_int err;
67
68 if (math_program == NULL)
70
71 cl_kernel kernel = clCreateKernel(math_program, "masked_copy_kernel_0", &err);
73
74 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
75 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
76 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
77 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
78 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
79
80 const int nb = ((*m) + 256 - 1) / 256;
81 const size_t global_item_size = 256 * nb;
82 const size_t local_item_size = 256;
83
86 0, NULL, NULL));
88
89}
90
94void opencl_masked_copy_aligned(void *a, void *b, void *mask, int *n, int *m,
96 cl_int err;
97
98 if (math_program == NULL)
100
101 cl_kernel kernel = clCreateKernel(math_program, "masked_copy_kernel_aligned", &err);
102 CL_CHECK(err);
103
104 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
105 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
106 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
107 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
108 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
109
110 const int nb = ((*m) + 256 - 1) / 256;
111 const size_t global_item_size = 256 * nb;
112 const size_t local_item_size = 256;
113
116 0, NULL, NULL));
118
119}
120
124void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m,
126 cl_int err;
127
128 if (math_program == NULL)
130
131 cl_kernel kernel = clCreateKernel(math_program, "masked_gather_copy_kernel",
132 &err);
133 CL_CHECK(err);
134
135 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
136 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
137 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
138 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
139 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
140
141 const int nb = ((*n) + 256 - 1) / 256;
142 const size_t global_item_size = 256 * nb;
143 const size_t local_item_size = 256;
144
147 0, NULL, NULL));
149
150}
151
155void opencl_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n,
156 int *m, cl_command_queue cmd_queue) {
157 cl_int err;
158
159 if (math_program == NULL)
161
163 "masked_gather_copy_aligned_kernel", &err);
164 CL_CHECK(err);
165
166 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
167 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
168 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
169 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
170 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
171
172 const int nb = ((*n) + 256 - 1) / 256;
173 const size_t global_item_size = 256 * nb;
174 const size_t local_item_size = 256;
175
178 0, NULL, NULL));
180
181}
182
186void opencl_face_masked_gather_copy(void *a, void *b, void *mask, void *facet,
187 int *n1, int *n2, int *lx, int *ly,
188 int *lz, int *m,
190 cl_int err;
191
192 if (math_program == NULL)
194
196 "face_masked_gather_copy_kernel", &err);
197 CL_CHECK(err);
198
199 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
200 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
201 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
202 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &facet));
203 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n1));
204 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n2));
205 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), lx));
206 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), ly));
207 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), lz));
208 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), m));
209
210 const int nb = ((*m) + 256 - 1) / 256;
211 const size_t global_item_size = 256 * nb;
212 const size_t local_item_size = 256;
213
216 0, NULL, NULL));
218
219}
220
224void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m,
226 cl_int err;
227
228 if (math_program == NULL)
230
231 cl_kernel kernel = clCreateKernel(math_program, "masked_scatter_copy_kernel",
232 &err);
233 CL_CHECK(err);
234
235 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
236 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
237 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
238 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
239 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
240
241 const int nb = ((*n) + 256 - 1) / 256;
242 const size_t global_item_size = 256 * nb;
243 const size_t local_item_size = 256;
244
247 0, NULL, NULL));
249
250}
251
255void opencl_masked_scatter_copy_aligned(void *a, void *b, void *mask, int *n, int *m,
257 cl_int err;
258
259 if (math_program == NULL)
261
262 cl_kernel kernel = clCreateKernel(math_program, "masked_scatter_copy_aligned_kernel",
263 &err);
264 CL_CHECK(err);
265
266 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
267 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
268 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
269 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
270 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
271
272 const int nb = ((*n) + 256 - 1) / 256;
273 const size_t global_item_size = 256 * nb;
274 const size_t local_item_size = 256;
275
278 0, NULL, NULL));
280
281}
282
286void opencl_cfill_mask(void* a, void* c, int* size, void* mask, int* mask_size,
288 cl_int err;
289
290 if (math_program == NULL)
292
293 cl_kernel kernel = clCreateKernel(math_program, "cfill_mask_kernel", &err);
294 CL_CHECK(err);
295
296 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
297 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
298 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), size));
299 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &mask));
300 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), mask_size));
301
302 const int nb = ((*mask_size) + 256 - 1) / 256;
303 const size_t global_item_size = 256 * nb;
304 const size_t local_item_size = 256;
305
308 0, NULL, NULL));
310}
311
317 real zero = 0.0;
318
320 (*n) * sizeof(real), 0, NULL, &wait_kern));
322}
323
329 real one = 1.0;
330
332 (*n) * sizeof(real), 0, NULL, &wait_kern));
334}
335
339void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue) {
340 cl_int err;
341
342 if (math_program == NULL)
344
345 cl_kernel kernel = clCreateKernel(math_program, "cmult_kernel", &err);
346 CL_CHECK(err);
347
348 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
349 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
350 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
351
352 const int nb = ((*n) + 256 - 1) / 256;
353 const size_t global_item_size = 256 * nb;
354 const size_t local_item_size = 256;
355
358 0, NULL, NULL));
360}
361
365void opencl_cmult2(void *a, void *b, real *c, int *n,
367 cl_int err;
368
369 if (math_program == NULL)
371
372 cl_kernel kernel = clCreateKernel(math_program, "cmult2_kernel", &err);
373 CL_CHECK(err);
374
375 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
376 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
377 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
378 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
379
380 const int nb = ((*n) + 256 - 1) / 256;
381 const size_t global_item_size = 256 * nb;
382 const size_t local_item_size = 256;
383
386 0, NULL, NULL));
388}
389
393void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue) {
394 cl_int err;
395
396 if (math_program == NULL)
398
399 cl_kernel kernel = clCreateKernel(math_program, "cdiv_kernel", &err);
400 CL_CHECK(err);
401
402 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
403 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
404 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
405
406 const int nb = ((*n) + 256 - 1) / 256;
407 const size_t global_item_size = 256 * nb;
408 const size_t local_item_size = 256;
409
412 0, NULL, NULL));
414}
415
419void opencl_cdiv2(void *a, void *b, real *c, int *n,
421 cl_int err;
422
423 if (math_program == NULL)
425
426 cl_kernel kernel = clCreateKernel(math_program, "cdiv2_kernel", &err);
427 CL_CHECK(err);
428
429 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
430 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
431 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
432 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
433
434 const int nb = ((*n) + 256 - 1) / 256;
435 const size_t global_item_size = 256 * nb;
436 const size_t local_item_size = 256;
437
440 0, NULL, NULL));
442}
443
447void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue) {
448 cl_int err;
449
450 if (math_program == NULL)
452
453 cl_kernel kernel = clCreateKernel(math_program, "radd_kernel", &err);
454 CL_CHECK(err);
455
456 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
457 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
458 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
459
460 const int nb = ((*n) + 256 - 1) / 256;
461 const size_t global_item_size = 256 * nb;
462 const size_t local_item_size = 256;
463
466 0, NULL, NULL));
468}
469
473void opencl_cadd2(void *a, void *b, real *c, int *n,
475 cl_int err;
476
477 if (math_program == NULL)
479
480 cl_kernel kernel = clCreateKernel(math_program, "cadd2_kernel", &err);
481 CL_CHECK(err);
482
483 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
484 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
485 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
486 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
487
488 const int nb = ((*n) + 256 - 1) / 256;
489 const size_t global_item_size = 256 * nb;
490 const size_t local_item_size = 256;
491
494 0, NULL, NULL));
496}
497
501void opencl_cwrap(void *a, real *min_val, real *max_val, int *n,
503 cl_int err;
504
505 if (math_program == NULL)
507
508 cl_kernel kernel = clCreateKernel(math_program, "cwrap_kernel", &err);
509 CL_CHECK(err);
510
511 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
514 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
515
516 const int nb = ((*n) + 256 - 1) / 256;
517 const size_t global_item_size = 256 * nb;
518 const size_t local_item_size = 256;
519
522 0, NULL, NULL));
524}
525
529void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue) {
530 cl_int err;
531
532 if (math_program == NULL)
534
535 cl_kernel kernel = clCreateKernel(math_program, "cfill_kernel", &err);
536 CL_CHECK(err);
537
538 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
539 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
540 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
541
542 const int nb = ((*n) + 256 - 1) / 256;
543 const size_t global_item_size = 256 * nb;
544 const size_t local_item_size = 256;
545
548 0, NULL, NULL));
550}
551
556void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
557 cl_int err;
558
559 if (math_program == NULL)
561
562 cl_kernel kernel = clCreateKernel(math_program, "add2_kernel", &err);
563 CL_CHECK(err);
564
565 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
566 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
567 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
568
569 const int nb = ((*n) + 256 - 1) / 256;
570 const size_t global_item_size = 256 * nb;
571 const size_t local_item_size = 256;
572
575 0, NULL, NULL));
577}
578
583void opencl_add3(void *a, void *b, void *c, int *n,
585 cl_int err;
586
587 if (math_program == NULL)
589
590 cl_kernel kernel = clCreateKernel(math_program, "add3_kernel", &err);
591 CL_CHECK(err);
592
593 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
594 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
595 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
596 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
597
598 const int nb = ((*n) + 256 - 1) / 256;
599 const size_t global_item_size = 256 * nb;
600 const size_t local_item_size = 256;
601
604 0, NULL, NULL));
606}
607
612void opencl_add4(void *a, void *b, void *c, void *d, int *n,
614 cl_int err;
615
616 if (math_program == NULL)
618
619 cl_kernel kernel = clCreateKernel(math_program, "add4_kernel", &err);
620 CL_CHECK(err);
621
622 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
623 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
624 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
625 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
626 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
627
628 const int nb = ((*n) + 256 - 1) / 256;
629 const size_t global_item_size = 256 * nb;
630 const size_t local_item_size = 256;
631
634 0, NULL, NULL));
636}
637
643void opencl_add2s1(void *a, void *b, real *c1, int *n,
645 cl_int err;
646
647 if (math_program == NULL)
649
650 cl_kernel kernel = clCreateKernel(math_program, "add2s1_kernel", &err);
651 CL_CHECK(err);
652
653 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
654 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
655 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
656 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
657
658 const int nb = ((*n) + 256 - 1) / 256;
659 const size_t global_item_size = 256 * nb;
660 const size_t local_item_size = 256;
661
664 0, NULL, NULL));
666}
667
673void opencl_add2s2(void *a, void *b, real *c1, int *n,
675 cl_int err;
676
677 if (math_program == NULL)
679
680 cl_kernel kernel = clCreateKernel(math_program, "add2s2_kernel", &err);
681 CL_CHECK(err);
682
683 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
684 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
685 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
686 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
687
688 const int nb = ((*n) + 256 - 1) / 256;
689 const size_t global_item_size = 256 * nb;
690 const size_t local_item_size = 256;
691
694 0, NULL, NULL));
696}
697
704void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n,
706 cl_int err;
707
708 if (math_program == NULL)
710
711 cl_kernel kernel = clCreateKernel(math_program, "add2s2_many_kernel", &err);
712 CL_CHECK(err);
713
714 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &x));
715 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &p));
716 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &alpha));
717 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), j));
718 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
719
720 const int nb = ((*n) + 256 - 1) / 256;
721 const size_t global_item_size = 256 * nb;
722 const size_t local_item_size = 256;
723
726 0, NULL, NULL));
728
729}
730
736void opencl_addsqr2s2(void *a, void *b, real *c1, int *n,
738 cl_int err;
739
740 if (math_program == NULL)
742
743 cl_kernel kernel = clCreateKernel(math_program, "addsqr2s2_kernel", &err);
744 CL_CHECK(err);
745
746 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
747 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
748 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
749 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
750
751 const int nb = ((*n) + 256 - 1) / 256;
752 const size_t global_item_size = 256 * nb;
753 const size_t local_item_size = 256;
754
757 0, NULL, NULL));
759}
760
765void opencl_add3s2(void *a, void *b, void * c, real *c1, real *c2, int *n,
767 cl_int err;
768
769 if (math_program == NULL)
771
772 cl_kernel kernel = clCreateKernel(math_program, "add3s2_kernel", &err);
773 CL_CHECK(err);
774
775 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
776 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
777 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
778 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), c1));
779 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c2));
780 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
781
782 const int nb = ((*n) + 256 - 1) / 256;
783 const size_t global_item_size = 256 * nb;
784 const size_t local_item_size = 256;
785
788 0, NULL, NULL));
790}
791
796void opencl_add4s3(void *a, void *b, void * c, void * d, real *c1, real *c2,
797 real *c3, int *n, cl_command_queue cmd_queue) {
798 cl_int err;
799
800 if (math_program == NULL)
802
803 cl_kernel kernel = clCreateKernel(math_program, "add4s3_kernel", &err);
804 CL_CHECK(err);
805
806 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
807 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
808 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
809 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
810 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c1));
811 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(real), c2));
812 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(real), c3));
813 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
814
815 const int nb = ((*n) + 256 - 1) / 256;
816 const size_t global_item_size = 256 * nb;
817 const size_t local_item_size = 256;
818
821 0, NULL, NULL));
823}
824
829void opencl_add5s4(void *a, void *b, void * c, void * d, void * e, real *c1,
830 real *c2, real *c3, real * c4, int *n,
832 cl_int err;
833
834 if (math_program == NULL)
836
837 cl_kernel kernel = clCreateKernel(math_program, "add5s4_kernel", &err);
838 CL_CHECK(err);
839
840 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
841 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
842 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
843 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
844 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &e));
845 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(real), c1));
846 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(real), c2));
847 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(real), c3));
848 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(real), c4));
849 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), n));
850
851 const int nb = ((*n) + 256 - 1) / 256;
852 const size_t global_item_size = 256 * nb;
853 const size_t local_item_size = 256;
854
857 0, NULL, NULL));
859}
860
866 cl_int err;
867
868 if (math_program == NULL)
870
871 cl_kernel kernel = clCreateKernel(math_program, "invcol1_kernel", &err);
872 CL_CHECK(err);
873
874 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
875 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
876
877 const int nb = ((*n) + 256 - 1) / 256;
878 const size_t global_item_size = 256 * nb;
879 const size_t local_item_size = 256;
880
883 0, NULL, NULL));
885}
886
891void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
892 cl_int err;
893
894 if (math_program == NULL)
896
897 cl_kernel kernel = clCreateKernel(math_program, "invcol2_kernel", &err);
898 CL_CHECK(err);
899
900 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
901 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
902 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
903
904 const int nb = ((*n) + 256 - 1) / 256;
905 const size_t global_item_size = 256 * nb;
906 const size_t local_item_size = 256;
907
910 0, NULL, NULL));
912}
913
918void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
919 cl_int err;
920
921 if (math_program == NULL)
923
924 cl_kernel kernel = clCreateKernel(math_program, "col2_kernel", &err);
925 CL_CHECK(err);
926
927 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
928 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
929 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
930
931 const int nb = ((*n) + 256 - 1) / 256;
932 const size_t global_item_size = 256 * nb;
933 const size_t local_item_size = 256;
934
937 0, NULL, NULL));
939}
940
945void opencl_col3(void *a, void *b, void *c, int *n,
947 cl_int err;
948
949 if (math_program == NULL)
951
952 cl_kernel kernel = clCreateKernel(math_program, "col3_kernel", &err);
953 CL_CHECK(err);
954
955 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
956 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
957 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
958 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
959
960 const int nb = ((*n) + 256 - 1) / 256;
961 const size_t global_item_size = 256 * nb;
962 const size_t local_item_size = 256;
963
966 0, NULL, NULL));
968}
969
974void opencl_subcol3(void *a, void *b, void *c, int *n,
976 cl_int err;
977
978 if (math_program == NULL)
980
981 cl_kernel kernel = clCreateKernel(math_program, "subcol3_kernel", &err);
982 CL_CHECK(err);
983
984 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
985 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
986 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
987 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
988
989 const int nb = ((*n) + 256 - 1) / 256;
990 const size_t global_item_size = 256 * nb;
991 const size_t local_item_size = 256;
992
995 0, NULL, NULL));
997}
998
1003void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
1004 cl_int err;
1005
1006 if (math_program == NULL)
1008
1009 cl_kernel kernel = clCreateKernel(math_program, "sub2_kernel", &err);
1010 CL_CHECK(err);
1011
1012 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1013 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1014 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1015
1016 const int nb = ((*n) + 256 - 1) / 256;
1017 const size_t global_item_size = 256 * nb;
1018 const size_t local_item_size = 256;
1019
1022 0, NULL, NULL));
1024}
1025
1030void opencl_sub3(void *a, void *b, void *c, int *n,
1032 cl_int err;
1033
1034 if (math_program == NULL)
1036
1037 cl_kernel kernel = clCreateKernel(math_program, "sub3_kernel", &err);
1038 CL_CHECK(err);
1039
1040 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1041 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1042 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1043 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1044
1045 const int nb = ((*n) + 256 - 1) / 256;
1046 const size_t global_item_size = 256 * nb;
1047 const size_t local_item_size = 256;
1048
1051 0, NULL, NULL));
1053}
1054
1059void opencl_addcol3(void *a, void *b, void *c, int *n,
1061 cl_int err;
1062
1063 if (math_program == NULL)
1065
1066 cl_kernel kernel = clCreateKernel(math_program, "addcol3_kernel", &err);
1067 CL_CHECK(err);
1068
1069 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1070 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1071 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1072 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1073
1074 const int nb = ((*n) + 256 - 1) / 256;
1075 const size_t global_item_size = 256 * nb;
1076 const size_t local_item_size = 256;
1077
1080 0, NULL, NULL));
1082}
1083
1088void opencl_addcol4(void *a, void *b, void *c, void *d, int *n,
1090 cl_int err;
1091
1092 if (math_program == NULL)
1094
1095 cl_kernel kernel = clCreateKernel(math_program, "addcol4_kernel", &err);
1096 CL_CHECK(err);
1097
1098 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1099 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1100 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1101 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
1102 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
1103
1104 const int nb = ((*n) + 256 - 1) / 256;
1105 const size_t global_item_size = 256 * nb;
1106 const size_t local_item_size = 256;
1107
1110 0, NULL, NULL));
1112}
1113
1118void opencl_addcol3s2(void *a, void *b, void *c, real *s, int *n,
1120 cl_int err;
1121
1122 if (math_program == NULL)
1124
1125 cl_kernel kernel = clCreateKernel(math_program, "addcol3s2_kernel", &err);
1126 CL_CHECK(err);
1127
1128 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1129 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1130 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1131 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), s));
1132 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
1133
1134 const int nb = ((*n) + 256 - 1) / 256;
1135 const size_t global_item_size = 256 * nb;
1136 const size_t local_item_size = 256;
1137
1140 0, NULL, NULL));
1142}
1143
1149void opencl_vdot3(void *dot, void *u1, void *u2, void *u3,
1150 void *v1, void *v2, void *v3, int *n,
1152 cl_int err;
1153
1154 if (math_program == NULL)
1156
1157 cl_kernel kernel = clCreateKernel(math_program, "vdot3_kernel", &err);
1158 CL_CHECK(err);
1159
1160 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dot));
1161 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u1));
1162 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u2));
1163 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u3));
1164 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v1));
1165 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v2));
1166 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &v3));
1167 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
1168
1169 const int nb = ((*n) + 256 - 1) / 256;
1170 const size_t global_item_size = 256 * nb;
1171 const size_t local_item_size = 256;
1172
1175 0, NULL, NULL));
1177}
1178
1184void opencl_vcross(void *u1, void *u2, void *u3,
1185 void *v1, void *v2, void *v3,
1186 void *w1, void *w2, void *w3,
1187 int *n, cl_command_queue cmd_queue) {
1188
1189 cl_int err;
1190
1191 if (math_program == NULL)
1193
1194 cl_kernel kernel = clCreateKernel(math_program, "vcross_kernel", &err);
1195 CL_CHECK(err);
1196
1197 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &u1));
1198 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u2));
1199 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u3));
1200 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &v1));
1201 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v2));
1202 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v3));
1203 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &w1));
1204 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &w2));
1205 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3));
1206 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), n));
1207
1208 const int nb = ((*n) + 256 - 1) / 256;
1209 const size_t global_item_size = 256 * nb;
1210 const size_t local_item_size = 256;
1211
1214 0, NULL, NULL));
1216
1217}
1218
1220int red_s = 0;
1223
1228real opencl_glsc3(void *a, void *b, void *c, int *n,
1230 cl_int err;
1232 int i;
1233
1234 if (math_program == NULL)
1236
1237 const int nb = ((*n) + 256 - 1) / 256;
1238 const size_t global_item_size = 256 * nb;
1239 const size_t local_item_size = 256;
1240
1241 if ( nb > red_s){
1242 red_s = nb;
1243 if (bufred != NULL) {
1244 free(bufred);
1246 }
1247 bufred = (real *) malloc(nb * sizeof(real));
1248
1250 nb * sizeof(real), NULL, &err);
1251 CL_CHECK(err);
1252 }
1253
1254 cl_kernel kernel = clCreateKernel(math_program, "glsc3_kernel", &err);
1255 CL_CHECK(err);
1256
1257 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1258 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1259 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1260 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
1261 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
1262
1265 0, NULL, &kern_wait));
1266
1268 nb * sizeof(real), bufred, 1,
1269 &kern_wait, NULL));
1270
1271 real res = 0.0;
1272 for (i = 0; i < nb; i++) {
1273 res += bufred[i];
1274 }
1275
1277
1278 return res;
1279}
1280
1285void opencl_glsc3_many(real *h, void * w, void *v, void *mult, int *j, int *n,
1287 int i, k;
1288 cl_int err;
1290
1291 if (math_program == NULL)
1293
1294 int pow2 = 1;
1295 while(pow2 < (*j)){
1296 pow2 = 2*pow2;
1297 }
1298
1299 const int nt = 256 / pow2;
1300 const int nb = ((*n) + nt - 1) / nt;
1301 const size_t local_item_size[2] = {nt, pow2};
1302 const size_t global_item_size[2] = {nb * nt, pow2};
1303
1304 if((*j)*nb > red_s) {
1305 red_s = (*j)*nb;
1306 if (bufred != NULL) {
1307 free(bufred);
1309 }
1310 bufred = (real *) malloc((*j) * nb * sizeof(real));
1311
1313 (*j) * nb * sizeof(real), NULL, &err);
1314 CL_CHECK(err);
1315 }
1316
1317 cl_kernel kernel = clCreateKernel(math_program, "glsc3_many_kernel", &err);
1318 CL_CHECK(err);
1319
1320 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w));
1321 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &v));
1322 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mult));
1323 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
1324 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), j));
1325 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
1326
1329 0, NULL, &kern_wait));
1330
1332 (*j) * nb * sizeof(real),
1333 bufred, 1, &kern_wait, NULL));
1334
1335 for (k = 0; k < (*j); k++) {
1336 h[k] = 0.0;
1337 }
1338
1339 for (i = 0; i < nb; i++) {
1340 for (k = 0; k < (*j); k++) {
1341 h[k] += bufred[i*(*j)+k];
1342 }
1343 }
1344
1346}
1347
1353 cl_int err;
1355 int i;
1356
1357 if (math_program == NULL)
1359
1360 const int nb = ((*n) + 256 - 1) / 256;
1361 const size_t global_item_size = 256 * nb;
1362 const size_t local_item_size = 256;
1363
1364 real * buf = (real *) malloc(nb * sizeof(real));
1365
1366 cl_kernel kernel = clCreateKernel(math_program, "glsc2_kernel", &err);
1367 CL_CHECK(err);
1368
1370 nb * sizeof(real), NULL, &err);
1371 CL_CHECK(err);
1372
1373 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1374 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1375 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1376 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1377
1380 0, NULL, &kern_wait));
1381
1383 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1384
1385 real res = 0.0;
1386 for (i = 0; i < nb; i++) {
1387 res += buf[i];
1388 }
1389
1390 free(buf);
1394
1395 return res;
1396}
1397
1403 cl_int err;
1405 int i;
1406
1407 if (math_program == NULL)
1409
1410 const int nb = ((*n) + 256 - 1) / 256;
1411 const size_t global_item_size = 256 * nb;
1412 const size_t local_item_size = 256;
1413
1414 real * buf = (real *) malloc(nb * sizeof(real));
1415
1416 cl_kernel kernel = clCreateKernel(math_program, "glsubnorm2_kernel", &err);
1417 CL_CHECK(err);
1418
1420 nb * sizeof(real), NULL, &err);
1421 CL_CHECK(err);
1422
1423 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1424 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1425 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1426 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1427
1430 0, NULL, &kern_wait));
1431
1433 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1434
1435 real res = 0.0;
1436 for (i = 0; i < nb; i++) {
1437 res += buf[i];
1438 }
1439
1440 free(buf);
1444
1445 return res;
1446}
1447
1453 cl_int err;
1455 int i;
1456
1457 if (math_program == NULL)
1459
1460 const int nb = ((*n) + 256 - 1) / 256;
1461 const size_t global_item_size = 256 * nb;
1462 const size_t local_item_size = 256;
1463
1464 real * buf = (real *) malloc(nb * sizeof(real));
1465
1466 cl_kernel kernel = clCreateKernel(math_program, "glsum_kernel", &err);
1467 CL_CHECK(err);
1468
1470 nb * sizeof(real), NULL, &err);
1471 CL_CHECK(err);
1472
1473 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1474 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
1475 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1476
1479 0, NULL, &kern_wait));
1480
1482 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1483
1484 real res = 0.0;
1485 for (i = 0; i < nb; i++) {
1486 res += buf[i];
1487 }
1488
1489 free(buf);
1493
1494 return res;
1495}
1496
1498 cl_int err;
1500 int i;
1501
1502 if (*n <= 0) {
1503 return -((real) HUGE_VAL);
1504 }
1505
1506 if (math_program == NULL)
1508
1509 const int nb = ((*n) + 256 - 1) / 256;
1510 const size_t global_item_size = 256 * nb;
1511 const size_t local_item_size = 256;
1512
1513 real * buf = (real *) malloc(nb * sizeof(real));
1514
1515 cl_kernel kernel = clCreateKernel(math_program, "glmax_kernel", &err);
1516 CL_CHECK(err);
1517
1519 nb * sizeof(real), NULL, &err);
1520 CL_CHECK(err);
1521
1522 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1523 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
1524 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1525
1528 0, NULL, &kern_wait));
1529
1531 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1532
1533 real res = buf[0];
1534 for (i = 1; i < nb; i++) {
1535 res = fmax(res, buf[i]);
1536 }
1537
1538 free(buf);
1542
1543 return res;
1544}
1545
1547 cl_int err;
1549 int i;
1550
1551 if (*n <= 0) {
1552 return (real) HUGE_VAL;
1553 }
1554
1555 if (math_program == NULL)
1557
1558 const int nb = ((*n) + 256 - 1) / 256;
1559 const size_t global_item_size = 256 * nb;
1560 const size_t local_item_size = 256;
1561
1562 real * buf = (real *) malloc(nb * sizeof(real));
1563
1564 cl_kernel kernel = clCreateKernel(math_program, "glmin_kernel", &err);
1565 CL_CHECK(err);
1566
1568 nb * sizeof(real), NULL, &err);
1569 CL_CHECK(err);
1570
1571 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1572 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
1573 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1574
1577 0, NULL, &kern_wait));
1578
1580 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1581
1582 real res = buf[0];
1583 for (i = 1; i < nb; i++) {
1584 res = fmin(res, buf[i]);
1585 }
1586
1587 free(buf);
1591
1592 return res;
1593}
1594
1595
1600 cl_int err;
1601
1602 if (math_program == NULL)
1604
1605 cl_kernel kernel = clCreateKernel(math_program, "absval_kernel", &err);
1606 CL_CHECK(err);
1607
1608 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1609 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
1610
1611 const int nb = ((*n) + 256 - 1) / 256;
1612 const size_t global_item_size = 256 * nb;
1613 const size_t local_item_size = 256;
1614
1617 0, NULL, NULL));
1618}
1619
1623void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue) {
1624 cl_int err;
1625
1626 if (math_program == NULL)
1628
1629 cl_kernel kernel = clCreateKernel(math_program, "iadd_kernel", &err);
1630 CL_CHECK(err);
1631
1632 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1633 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), c));
1634 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1635
1636 const int nb = ((*n) + 256 - 1) / 256;
1637 const size_t global_item_size = 256 * nb;
1638 const size_t local_item_size = 256;
1639
1642 0, NULL, NULL));
1644}
1645
1650void opencl_pwmax_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
1651 cl_int err;
1652
1653 if (math_program == NULL)
1655
1656 cl_kernel kernel = clCreateKernel(math_program, "pwmax_vec2_kernel", &err);
1657 CL_CHECK(err);
1658
1659 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1660 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1661 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1662
1663 const int nb = ((*n) + 256 - 1) / 256;
1664 const size_t global_item_size = 256 * nb;
1665 const size_t local_item_size = 256;
1666
1669 0, NULL, NULL));
1671}
1672
1677void opencl_pwmax_vec3(void *a, void *b, void *c,
1678 int *n, cl_command_queue cmd_queue) {
1679 cl_int err;
1680
1681 if (math_program == NULL)
1683
1684 cl_kernel kernel = clCreateKernel(math_program, "pwmax_vec3_kernel", &err);
1685 CL_CHECK(err);
1686
1687 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1688 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1689 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1690 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1691
1692 const int nb = ((*n) + 256 - 1) / 256;
1693 const size_t global_item_size = 256 * nb;
1694 const size_t local_item_size = 256;
1695
1698 0, NULL, NULL));
1700}
1701
1707 cl_int err;
1708
1709 if (math_program == NULL)
1711
1712 cl_kernel kernel = clCreateKernel(math_program, "pwmax_sca2_kernel", &err);
1713 CL_CHECK(err);
1714
1715 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1716 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
1717 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1718
1719 const int nb = ((*n) + 256 - 1) / 256;
1720 const size_t global_item_size = 256 * nb;
1721 const size_t local_item_size = 256;
1722
1725 0, NULL, NULL));
1727}
1728
1733void opencl_pwmax_sca3(void *a, void *b, real *c,
1734 int *n, cl_command_queue cmd_queue) {
1735 cl_int err;
1736
1737 if (math_program == NULL)
1739
1740 cl_kernel kernel = clCreateKernel(math_program, "pwmax_sca3_kernel", &err);
1741 CL_CHECK(err);
1742
1743 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1744 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1745 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
1746 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1747
1748 const int nb = ((*n) + 256 - 1) / 256;
1749 const size_t global_item_size = 256 * nb;
1750 const size_t local_item_size = 256;
1751
1754 0, NULL, NULL));
1756}
1757
1762void opencl_pwmin_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
1763 cl_int err;
1764
1765 if (math_program == NULL)
1767
1768 cl_kernel kernel = clCreateKernel(math_program, "pwmin_vec2_kernel", &err);
1769 CL_CHECK(err);
1770
1771 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1772 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1773 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1774
1775 const int nb = ((*n) + 256 - 1) / 256;
1776 const size_t global_item_size = 256 * nb;
1777 const size_t local_item_size = 256;
1778
1781 0, NULL, NULL));
1783}
1784
1789void opencl_pwmin_vec3(void *a, void *b, void *c,
1790 int *n, cl_command_queue cmd_queue) {
1791 cl_int err;
1792
1793 if (math_program == NULL)
1795
1796 cl_kernel kernel = clCreateKernel(math_program, "pwmin_vec3_kernel", &err);
1797 CL_CHECK(err);
1798
1799 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1800 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1801 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1802 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1803
1804 const int nb = ((*n) + 256 - 1) / 256;
1805 const size_t global_item_size = 256 * nb;
1806 const size_t local_item_size = 256;
1807
1810 0, NULL, NULL));
1812}
1813
1819 cl_int err;
1820
1821 if (math_program == NULL)
1823
1824 cl_kernel kernel = clCreateKernel(math_program, "pwmin_sca2_kernel", &err);
1825 CL_CHECK(err);
1826
1827 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1828 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
1829 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1830
1831 const int nb = ((*n) + 256 - 1) / 256;
1832 const size_t global_item_size = 256 * nb;
1833 const size_t local_item_size = 256;
1834
1837 0, NULL, NULL));
1839}
1840
1845void opencl_pwmin_sca3(void *a, void *b, real *c,
1846 int *n, cl_command_queue cmd_queue) {
1847 cl_int err;
1848
1849 if (math_program == NULL)
1851
1852 cl_kernel kernel = clCreateKernel(math_program, "pwmin_sca3_kernel", &err);
1853 CL_CHECK(err);
1854
1855 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1856 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1857 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
1858 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1859
1860 const int nb = ((*n) + 256 - 1) / 256;
1861 const size_t global_item_size = 256 * nb;
1862 const size_t local_item_size = 256;
1863
1866 0, NULL, NULL));
1868}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
const int i
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
void * glb_ctx
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition jit.c:50
void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1623
void opencl_col3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:945
void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:393
void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:224
void opencl_pwmax_sca2(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1706
void opencl_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cl_command_queue cmd_queue)
Definition math.c:1184
void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:124
void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1003
void opencl_face_masked_gather_copy(void *a, void *b, void *mask, void *facet, int *n1, int *n2, int *lx, int *ly, int *lz, int *m, cl_command_queue cmd_queue)
Definition math.c:186
void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:918
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:704
void opencl_sub3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1030
void opencl_add2s1(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:643
void opencl_addcol3s2(void *a, void *b, void *c, real *s, int *n, cl_command_queue cmd_queue)
Definition math.c:1118
void opencl_invcol1(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:865
void opencl_add3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:583
real opencl_glsc3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1228
void opencl_rone(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:327
void opencl_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, cl_command_queue cmd_queue)
Definition math.c:829
void opencl_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, cl_command_queue cmd_queue)
Definition math.c:796
void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:339
void opencl_cfill_mask(void *a, void *c, int *size, void *mask, int *mask_size, cl_command_queue cmd_queue)
Definition math.c:286
void opencl_cadd2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:473
void opencl_masked_scatter_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:255
void opencl_pwmin_sca3(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1845
void opencl_pwmax_vec3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1677
real * bufred
Definition math.c:1221
real opencl_glsum(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1452
void opencl_add4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:612
void opencl_pwmin_vec3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1789
void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:447
void opencl_add2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:673
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cl_command_queue cmd_queue)
Definition math.c:1149
void opencl_pwmin_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1762
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cl_command_queue cmd_queue)
Definition math.c:765
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:1285
void opencl_masked_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:94
real opencl_glsubnorm2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1402
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:736
int red_s
Definition math.c:1220
void opencl_absval(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1599
void opencl_cwrap(void *a, real *min_val, real *max_val, int *n, cl_command_queue cmd_queue)
Definition math.c:501
void opencl_addcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1059
cl_mem bufred_d
Definition math.c:1222
real opencl_glmin(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1546
void opencl_pwmax_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1650
void opencl_rzero(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:315
void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:54
void opencl_subcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:974
void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:556
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:1088
void opencl_pwmin_sca2(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1818
void opencl_pwmax_sca3(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1733
void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:891
real opencl_glsc2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1352
void opencl_cdiv2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:419
void opencl_masked_copy_0(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:64
void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:529
void opencl_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:155
real opencl_glmax(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1497
void opencl_cmult2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:365
Object for handling masks in Neko.
Definition mask.f90:34
#define CL_CHECK(err)
Definition check.h:12
real * buf
Definition pipecg_aux.cu:42
void * math_program