Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.c
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifdef __APPLE__
36#include <OpenCL/cl.h>
37#else
38#include <CL/cl.h>
39#endif
40
41#include <stdio.h>
42#include <stdlib.h>
43#include <math.h>
45#include <device/opencl/jit.h>
47#include <device/opencl/check.h>
48
49#include "math_kernel.cl.h"
50
54void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue) {
56 b, a, 0, 0, (*n) * sizeof(real),
57 0, NULL, NULL));
58}
59
63void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m,
65 cl_int err;
66
67 if (math_program == NULL)
69
70 cl_kernel kernel = clCreateKernel(math_program, "masked_copy_kernel", &err);
72
73 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
74 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
75 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
76 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
77 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
78
79 const int nb = ((*n) + 256 - 1) / 256;
80 const size_t global_item_size = 256 * nb;
81 const size_t local_item_size = 256;
82
85 0, NULL, NULL));
87
88}
89
93void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m,
95 cl_int err;
96
97 if (math_program == NULL)
99
100 cl_kernel kernel = clCreateKernel(math_program, "masked_gather_copy_kernel",
101 &err);
102 CL_CHECK(err);
103
104 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
105 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
106 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
107 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
108 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
109
110 const int nb = ((*n) + 256 - 1) / 256;
111 const size_t global_item_size = 256 * nb;
112 const size_t local_item_size = 256;
113
116 0, NULL, NULL));
118
119}
120
124void opencl_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n,
125 int *m, cl_command_queue cmd_queue) {
126 cl_int err;
127
128 if (math_program == NULL)
130
132 "masked_gather_copy_aligned_kernel", &err);
133 CL_CHECK(err);
134
135 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
136 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
137 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
138 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
139 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
140
141 const int nb = ((*n) + 256 - 1) / 256;
142 const size_t global_item_size = 256 * nb;
143 const size_t local_item_size = 256;
144
147 0, NULL, NULL));
149
150}
151
155void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m,
157 cl_int err;
158
159 if (math_program == NULL)
161
162 cl_kernel kernel = clCreateKernel(math_program, "masked_scatter_copy_kernel",
163 &err);
164 CL_CHECK(err);
165
166 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
167 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
168 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
169 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
170 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
171
172 const int nb = ((*n) + 256 - 1) / 256;
173 const size_t global_item_size = 256 * nb;
174 const size_t local_item_size = 256;
175
178 0, NULL, NULL));
180
181}
182
186void opencl_cfill_mask(void* a, void* c, int* size, void* mask, int* mask_size,
188 cl_int err;
189
190 if (math_program == NULL)
192
193 cl_kernel kernel = clCreateKernel(math_program, "cfill_mask_kernel", &err);
194 CL_CHECK(err);
195
196 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
197 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
198 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), size));
199 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &mask));
200 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), mask_size));
201
202 const int nb = ((*mask_size) + 256 - 1) / 256;
203 const size_t global_item_size = 256 * nb;
204 const size_t local_item_size = 256;
205
208 0, NULL, NULL));
210}
211
217 real zero = 0.0;
218
220 (*n) * sizeof(real), 0, NULL, &wait_kern));
222}
223
229 real one = 1.0;
230
232 (*n) * sizeof(real), 0, NULL, &wait_kern));
234}
235
239void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue) {
240 cl_int err;
241
242 if (math_program == NULL)
244
245 cl_kernel kernel = clCreateKernel(math_program, "cmult_kernel", &err);
246 CL_CHECK(err);
247
248 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
249 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
250 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
251
252 const int nb = ((*n) + 256 - 1) / 256;
253 const size_t global_item_size = 256 * nb;
254 const size_t local_item_size = 256;
255
258 0, NULL, NULL));
260}
261
265void opencl_cmult2(void *a, void *b, real *c, int *n,
267 cl_int err;
268
269 if (math_program == NULL)
271
272 cl_kernel kernel = clCreateKernel(math_program, "cmult2_kernel", &err);
273 CL_CHECK(err);
274
275 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
276 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
277 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
278 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
279
280 const int nb = ((*n) + 256 - 1) / 256;
281 const size_t global_item_size = 256 * nb;
282 const size_t local_item_size = 256;
283
286 0, NULL, NULL));
288}
289
293void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue) {
294 cl_int err;
295
296 if (math_program == NULL)
298
299 cl_kernel kernel = clCreateKernel(math_program, "cdiv_kernel", &err);
300 CL_CHECK(err);
301
302 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
303 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
304 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
305
306 const int nb = ((*n) + 256 - 1) / 256;
307 const size_t global_item_size = 256 * nb;
308 const size_t local_item_size = 256;
309
312 0, NULL, NULL));
314}
315
319void opencl_cdiv2(void *a, void *b, real *c, int *n,
321 cl_int err;
322
323 if (math_program == NULL)
325
326 cl_kernel kernel = clCreateKernel(math_program, "cdiv2_kernel", &err);
327 CL_CHECK(err);
328
329 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
330 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
331 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
332 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
333
334 const int nb = ((*n) + 256 - 1) / 256;
335 const size_t global_item_size = 256 * nb;
336 const size_t local_item_size = 256;
337
340 0, NULL, NULL));
342}
343
347void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue) {
348 cl_int err;
349
350 if (math_program == NULL)
352
353 cl_kernel kernel = clCreateKernel(math_program, "radd_kernel", &err);
354 CL_CHECK(err);
355
356 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
357 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
358 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
359
360 const int nb = ((*n) + 256 - 1) / 256;
361 const size_t global_item_size = 256 * nb;
362 const size_t local_item_size = 256;
363
366 0, NULL, NULL));
368}
369
373void opencl_cadd2(void *a, void *b, real *c, int *n,
375 cl_int err;
376
377 if (math_program == NULL)
379
380 cl_kernel kernel = clCreateKernel(math_program, "cadd2_kernel", &err);
381 CL_CHECK(err);
382
383 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
384 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
385 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
386 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
387
388 const int nb = ((*n) + 256 - 1) / 256;
389 const size_t global_item_size = 256 * nb;
390 const size_t local_item_size = 256;
391
394 0, NULL, NULL));
396}
397
401void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue) {
402 cl_int err;
403
404 if (math_program == NULL)
406
407 cl_kernel kernel = clCreateKernel(math_program, "cfill_kernel", &err);
408 CL_CHECK(err);
409
410 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
411 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
412 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
413
414 const int nb = ((*n) + 256 - 1) / 256;
415 const size_t global_item_size = 256 * nb;
416 const size_t local_item_size = 256;
417
420 0, NULL, NULL));
422}
423
428void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
429 cl_int err;
430
431 if (math_program == NULL)
433
434 cl_kernel kernel = clCreateKernel(math_program, "add2_kernel", &err);
435 CL_CHECK(err);
436
437 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
438 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
439 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
440
441 const int nb = ((*n) + 256 - 1) / 256;
442 const size_t global_item_size = 256 * nb;
443 const size_t local_item_size = 256;
444
447 0, NULL, NULL));
449}
450
455void opencl_add3(void *a, void *b, void *c, int *n,
457 cl_int err;
458
459 if (math_program == NULL)
461
462 cl_kernel kernel = clCreateKernel(math_program, "add3_kernel", &err);
463 CL_CHECK(err);
464
465 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
466 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
467 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
468 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
469
470 const int nb = ((*n) + 256 - 1) / 256;
471 const size_t global_item_size = 256 * nb;
472 const size_t local_item_size = 256;
473
476 0, NULL, NULL));
478}
479
484void opencl_add4(void *a, void *b, void *c, void *d, int *n,
486 cl_int err;
487
488 if (math_program == NULL)
490
491 cl_kernel kernel = clCreateKernel(math_program, "add4_kernel", &err);
492 CL_CHECK(err);
493
494 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
495 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
496 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
497 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
498 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
499
500 const int nb = ((*n) + 256 - 1) / 256;
501 const size_t global_item_size = 256 * nb;
502 const size_t local_item_size = 256;
503
506 0, NULL, NULL));
508}
509
515void opencl_add2s1(void *a, void *b, real *c1, int *n,
517 cl_int err;
518
519 if (math_program == NULL)
521
522 cl_kernel kernel = clCreateKernel(math_program, "add2s1_kernel", &err);
523 CL_CHECK(err);
524
525 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
526 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
527 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
528 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
529
530 const int nb = ((*n) + 256 - 1) / 256;
531 const size_t global_item_size = 256 * nb;
532 const size_t local_item_size = 256;
533
536 0, NULL, NULL));
538}
539
545void opencl_add2s2(void *a, void *b, real *c1, int *n,
547 cl_int err;
548
549 if (math_program == NULL)
551
552 cl_kernel kernel = clCreateKernel(math_program, "add2s2_kernel", &err);
553 CL_CHECK(err);
554
555 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
556 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
557 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
558 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
559
560 const int nb = ((*n) + 256 - 1) / 256;
561 const size_t global_item_size = 256 * nb;
562 const size_t local_item_size = 256;
563
566 0, NULL, NULL));
568}
569
576void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n,
578 cl_int err;
579
580 if (math_program == NULL)
582
583 cl_kernel kernel = clCreateKernel(math_program, "add2s2_many_kernel", &err);
584 CL_CHECK(err);
585
586 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &x));
587 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &p));
588 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &alpha));
589 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), j));
590 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
591
592 const int nb = ((*n) + 256 - 1) / 256;
593 const size_t global_item_size = 256 * nb;
594 const size_t local_item_size = 256;
595
598 0, NULL, NULL));
600
601}
602
608void opencl_addsqr2s2(void *a, void *b, real *c1, int *n,
610 cl_int err;
611
612 if (math_program == NULL)
614
615 cl_kernel kernel = clCreateKernel(math_program, "addsqr2s2_kernel", &err);
616 CL_CHECK(err);
617
618 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
619 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
620 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
621 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
622
623 const int nb = ((*n) + 256 - 1) / 256;
624 const size_t global_item_size = 256 * nb;
625 const size_t local_item_size = 256;
626
629 0, NULL, NULL));
631}
632
637void opencl_add3s2(void *a, void *b, void * c, real *c1, real *c2, int *n,
639 cl_int err;
640
641 if (math_program == NULL)
643
644 cl_kernel kernel = clCreateKernel(math_program, "add3s2_kernel", &err);
645 CL_CHECK(err);
646
647 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
648 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
649 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
650 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), c1));
651 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c2));
652 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
653
654 const int nb = ((*n) + 256 - 1) / 256;
655 const size_t global_item_size = 256 * nb;
656 const size_t local_item_size = 256;
657
660 0, NULL, NULL));
662}
663
668void opencl_add4s3(void *a, void *b, void * c, void * d, real *c1, real *c2,
669 real *c3, int *n, cl_command_queue cmd_queue) {
670 cl_int err;
671
672 if (math_program == NULL)
674
675 cl_kernel kernel = clCreateKernel(math_program, "add4s3_kernel", &err);
676 CL_CHECK(err);
677
678 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
679 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
680 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
681 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
682 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c1));
683 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(real), c2));
684 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(real), c3));
685 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
686
687 const int nb = ((*n) + 256 - 1) / 256;
688 const size_t global_item_size = 256 * nb;
689 const size_t local_item_size = 256;
690
693 0, NULL, NULL));
695}
696
701void opencl_add5s4(void *a, void *b, void * c, void * d, void * e, real *c1,
702 real *c2, real *c3, real * c4, int *n,
704 cl_int err;
705
706 if (math_program == NULL)
708
709 cl_kernel kernel = clCreateKernel(math_program, "add5s4_kernel", &err);
710 CL_CHECK(err);
711
712 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
713 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
714 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
715 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
716 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &e));
717 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(real), c1));
718 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(real), c2));
719 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(real), c3));
720 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(real), c4));
721 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), n));
722
723 const int nb = ((*n) + 256 - 1) / 256;
724 const size_t global_item_size = 256 * nb;
725 const size_t local_item_size = 256;
726
729 0, NULL, NULL));
731}
732
738 cl_int err;
739
740 if (math_program == NULL)
742
743 cl_kernel kernel = clCreateKernel(math_program, "invcol1_kernel", &err);
744 CL_CHECK(err);
745
746 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
747 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
748
749 const int nb = ((*n) + 256 - 1) / 256;
750 const size_t global_item_size = 256 * nb;
751 const size_t local_item_size = 256;
752
755 0, NULL, NULL));
757}
758
763void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
764 cl_int err;
765
766 if (math_program == NULL)
768
769 cl_kernel kernel = clCreateKernel(math_program, "invcol2_kernel", &err);
770 CL_CHECK(err);
771
772 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
773 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
774 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
775
776 const int nb = ((*n) + 256 - 1) / 256;
777 const size_t global_item_size = 256 * nb;
778 const size_t local_item_size = 256;
779
782 0, NULL, NULL));
784}
785
790void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
791 cl_int err;
792
793 if (math_program == NULL)
795
796 cl_kernel kernel = clCreateKernel(math_program, "col2_kernel", &err);
797 CL_CHECK(err);
798
799 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
800 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
801 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
802
803 const int nb = ((*n) + 256 - 1) / 256;
804 const size_t global_item_size = 256 * nb;
805 const size_t local_item_size = 256;
806
809 0, NULL, NULL));
811}
812
817void opencl_col3(void *a, void *b, void *c, int *n,
819 cl_int err;
820
821 if (math_program == NULL)
823
824 cl_kernel kernel = clCreateKernel(math_program, "col3_kernel", &err);
825 CL_CHECK(err);
826
827 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
828 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
829 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
830 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
831
832 const int nb = ((*n) + 256 - 1) / 256;
833 const size_t global_item_size = 256 * nb;
834 const size_t local_item_size = 256;
835
838 0, NULL, NULL));
840}
841
846void opencl_subcol3(void *a, void *b, void *c, int *n,
848 cl_int err;
849
850 if (math_program == NULL)
852
853 cl_kernel kernel = clCreateKernel(math_program, "subcol3_kernel", &err);
854 CL_CHECK(err);
855
856 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
857 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
858 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
859 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
860
861 const int nb = ((*n) + 256 - 1) / 256;
862 const size_t global_item_size = 256 * nb;
863 const size_t local_item_size = 256;
864
867 0, NULL, NULL));
869}
870
875void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
876 cl_int err;
877
878 if (math_program == NULL)
880
881 cl_kernel kernel = clCreateKernel(math_program, "sub2_kernel", &err);
882 CL_CHECK(err);
883
884 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
885 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
886 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
887
888 const int nb = ((*n) + 256 - 1) / 256;
889 const size_t global_item_size = 256 * nb;
890 const size_t local_item_size = 256;
891
894 0, NULL, NULL));
896}
897
902void opencl_sub3(void *a, void *b, void *c, int *n,
904 cl_int err;
905
906 if (math_program == NULL)
908
909 cl_kernel kernel = clCreateKernel(math_program, "sub3_kernel", &err);
910 CL_CHECK(err);
911
912 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
913 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
914 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
915 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
916
917 const int nb = ((*n) + 256 - 1) / 256;
918 const size_t global_item_size = 256 * nb;
919 const size_t local_item_size = 256;
920
923 0, NULL, NULL));
925}
926
931void opencl_addcol3(void *a, void *b, void *c, int *n,
933 cl_int err;
934
935 if (math_program == NULL)
937
938 cl_kernel kernel = clCreateKernel(math_program, "addcol3_kernel", &err);
939 CL_CHECK(err);
940
941 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
942 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
943 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
944 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
945
946 const int nb = ((*n) + 256 - 1) / 256;
947 const size_t global_item_size = 256 * nb;
948 const size_t local_item_size = 256;
949
952 0, NULL, NULL));
954}
955
960void opencl_addcol4(void *a, void *b, void *c, void *d, int *n,
962 cl_int err;
963
964 if (math_program == NULL)
966
967 cl_kernel kernel = clCreateKernel(math_program, "addcol4_kernel", &err);
968 CL_CHECK(err);
969
970 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
971 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
972 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
973 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
974 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
975
976 const int nb = ((*n) + 256 - 1) / 256;
977 const size_t global_item_size = 256 * nb;
978 const size_t local_item_size = 256;
979
982 0, NULL, NULL));
984}
985
990void opencl_addcol3s2(void *a, void *b, void *c, real *s, int *n,
992 cl_int err;
993
994 if (math_program == NULL)
996
997 cl_kernel kernel = clCreateKernel(math_program, "addcol3s2_kernel", &err);
998 CL_CHECK(err);
999
1000 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1001 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1002 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1003 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), s));
1004 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
1005
1006 const int nb = ((*n) + 256 - 1) / 256;
1007 const size_t global_item_size = 256 * nb;
1008 const size_t local_item_size = 256;
1009
1012 0, NULL, NULL));
1014}
1015
1021void opencl_vdot3(void *dot, void *u1, void *u2, void *u3,
1022 void *v1, void *v2, void *v3, int *n,
1024 cl_int err;
1025
1026 if (math_program == NULL)
1028
1029 cl_kernel kernel = clCreateKernel(math_program, "vdot3_kernel", &err);
1030 CL_CHECK(err);
1031
1032 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dot));
1033 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u1));
1034 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u2));
1035 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u3));
1036 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v1));
1037 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v2));
1038 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &v3));
1039 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
1040
1041 const int nb = ((*n) + 256 - 1) / 256;
1042 const size_t global_item_size = 256 * nb;
1043 const size_t local_item_size = 256;
1044
1047 0, NULL, NULL));
1049}
1050
1056void opencl_vcross(void *u1, void *u2, void *u3,
1057 void *v1, void *v2, void *v3,
1058 void *w1, void *w2, void *w3,
1059 int *n, cl_command_queue cmd_queue) {
1060
1061 cl_int err;
1062
1063 if (math_program == NULL)
1065
1066 cl_kernel kernel = clCreateKernel(math_program, "vcross_kernel", &err);
1067 CL_CHECK(err);
1068
1069 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &u1));
1070 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u2));
1071 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u3));
1072 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &v1));
1073 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v2));
1074 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v3));
1075 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &w1));
1076 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &w2));
1077 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3));
1078 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), n));
1079
1080 const int nb = ((*n) + 256 - 1) / 256;
1081 const size_t global_item_size = 256 * nb;
1082 const size_t local_item_size = 256;
1083
1086 0, NULL, NULL));
1088
1089}
1090
1092int red_s = 0;
1095
1100real opencl_glsc3(void *a, void *b, void *c, int *n,
1102 cl_int err;
1104 int i;
1105
1106 if (math_program == NULL)
1108
1109 const int nb = ((*n) + 256 - 1) / 256;
1110 const size_t global_item_size = 256 * nb;
1111 const size_t local_item_size = 256;
1112
1113 if ( nb > red_s){
1114 red_s = nb;
1115 if (bufred != NULL) {
1116 free(bufred);
1118 }
1119 bufred = (real *) malloc(nb * sizeof(real));
1120
1122 nb * sizeof(real), NULL, &err);
1123 CL_CHECK(err);
1124 }
1125
1126 cl_kernel kernel = clCreateKernel(math_program, "glsc3_kernel", &err);
1127 CL_CHECK(err);
1128
1129 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1130 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1131 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1132 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
1133 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
1134
1137 0, NULL, &kern_wait));
1138
1140 nb * sizeof(real), bufred, 1,
1141 &kern_wait, NULL));
1142
1143 real res = 0.0;
1144 for (i = 0; i < nb; i++) {
1145 res += bufred[i];
1146 }
1147
1149
1150 return res;
1151}
1152
1157void opencl_glsc3_many(real *h, void * w, void *v, void *mult, int *j, int *n,
1159 int i, k;
1160 cl_int err;
1162
1163 if (math_program == NULL)
1165
1166 int pow2 = 1;
1167 while(pow2 < (*j)){
1168 pow2 = 2*pow2;
1169 }
1170
1171 const int nt = 256 / pow2;
1172 const int nb = ((*n) + nt - 1) / nt;
1173 const size_t local_item_size[2] = {nt, pow2};
1174 const size_t global_item_size[2] = {nb * nt, pow2};
1175
1176 if((*j)*nb > red_s) {
1177 red_s = (*j)*nb;
1178 if (bufred != NULL) {
1179 free(bufred);
1181 }
1182 bufred = (real *) malloc((*j) * nb * sizeof(real));
1183
1185 (*j) * nb * sizeof(real), NULL, &err);
1186 CL_CHECK(err);
1187 }
1188
1189 cl_kernel kernel = clCreateKernel(math_program, "glsc3_many_kernel", &err);
1190 CL_CHECK(err);
1191
1192 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w));
1193 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &v));
1194 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mult));
1195 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
1196 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), j));
1197 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
1198
1201 0, NULL, &kern_wait));
1202
1204 (*j) * nb * sizeof(real),
1205 bufred, 1, &kern_wait, NULL));
1206
1207 for (k = 0; k < (*j); k++) {
1208 h[k] = 0.0;
1209 }
1210
1211 for (i = 0; i < nb; i++) {
1212 for (k = 0; k < (*j); k++) {
1213 h[k] += bufred[i*(*j)+k];
1214 }
1215 }
1216
1218}
1219
1225 cl_int err;
1227 int i;
1228
1229 if (math_program == NULL)
1231
1232 const int nb = ((*n) + 256 - 1) / 256;
1233 const size_t global_item_size = 256 * nb;
1234 const size_t local_item_size = 256;
1235
1236 real * buf = (real *) malloc(nb * sizeof(real));
1237
1238 cl_kernel kernel = clCreateKernel(math_program, "glsc2_kernel", &err);
1239 CL_CHECK(err);
1240
1242 nb * sizeof(real), NULL, &err);
1243 CL_CHECK(err);
1244
1245 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1246 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1247 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1248 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1249
1252 0, NULL, &kern_wait));
1253
1255 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1256
1257 real res = 0.0;
1258 for (i = 0; i < nb; i++) {
1259 res += buf[i];
1260 }
1261
1262 free(buf);
1266
1267 return res;
1268}
1269
1275 cl_int err;
1277 int i;
1278
1279 if (math_program == NULL)
1281
1282 const int nb = ((*n) + 256 - 1) / 256;
1283 const size_t global_item_size = 256 * nb;
1284 const size_t local_item_size = 256;
1285
1286 real * buf = (real *) malloc(nb * sizeof(real));
1287
1288 cl_kernel kernel = clCreateKernel(math_program, "glsubnorm2_kernel", &err);
1289 CL_CHECK(err);
1290
1292 nb * sizeof(real), NULL, &err);
1293 CL_CHECK(err);
1294
1295 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1296 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1297 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1298 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1299
1302 0, NULL, &kern_wait));
1303
1305 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1306
1307 real res = 0.0;
1308 for (i = 0; i < nb; i++) {
1309 res += buf[i];
1310 }
1311
1312 free(buf);
1316
1317 return res;
1318}
1319
1325 cl_int err;
1327 int i;
1328
1329 if (math_program == NULL)
1331
1332 const int nb = ((*n) + 256 - 1) / 256;
1333 const size_t global_item_size = 256 * nb;
1334 const size_t local_item_size = 256;
1335
1336 real * buf = (real *) malloc(nb * sizeof(real));
1337
1338 cl_kernel kernel = clCreateKernel(math_program, "glsum_kernel", &err);
1339 CL_CHECK(err);
1340
1342 nb * sizeof(real), NULL, &err);
1343 CL_CHECK(err);
1344
1345 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1346 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
1347 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1348
1351 0, NULL, &kern_wait));
1352
1354 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1355
1356 real res = 0.0;
1357 for (i = 0; i < nb; i++) {
1358 res += buf[i];
1359 }
1360
1361 free(buf);
1365
1366 return res;
1367}
1368
1370 cl_int err;
1372 int i;
1373
1374 if (*n <= 0) {
1375 return -((real) HUGE_VAL);
1376 }
1377
1378 if (math_program == NULL)
1380
1381 const int nb = ((*n) + 256 - 1) / 256;
1382 const size_t global_item_size = 256 * nb;
1383 const size_t local_item_size = 256;
1384
1385 real * buf = (real *) malloc(nb * sizeof(real));
1386
1387 cl_kernel kernel = clCreateKernel(math_program, "glmax_kernel", &err);
1388 CL_CHECK(err);
1389
1391 nb * sizeof(real), NULL, &err);
1392 CL_CHECK(err);
1393
1394 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1395 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
1396 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1397
1400 0, NULL, &kern_wait));
1401
1403 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1404
1405 real res = buf[0];
1406 for (i = 1; i < nb; i++) {
1407 res = fmax(res, buf[i]);
1408 }
1409
1410 free(buf);
1414
1415 return res;
1416}
1417
1419 cl_int err;
1421 int i;
1422
1423 if (*n <= 0) {
1424 return (real) HUGE_VAL;
1425 }
1426
1427 if (math_program == NULL)
1429
1430 const int nb = ((*n) + 256 - 1) / 256;
1431 const size_t global_item_size = 256 * nb;
1432 const size_t local_item_size = 256;
1433
1434 real * buf = (real *) malloc(nb * sizeof(real));
1435
1436 cl_kernel kernel = clCreateKernel(math_program, "glmin_kernel", &err);
1437 CL_CHECK(err);
1438
1440 nb * sizeof(real), NULL, &err);
1441 CL_CHECK(err);
1442
1443 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1444 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
1445 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1446
1449 0, NULL, &kern_wait));
1450
1452 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1453
1454 real res = buf[0];
1455 for (i = 1; i < nb; i++) {
1456 res = fmin(res, buf[i]);
1457 }
1458
1459 free(buf);
1463
1464 return res;
1465}
1466
1467
1472 cl_int err;
1473
1474 if (math_program == NULL)
1476
1477 cl_kernel kernel = clCreateKernel(math_program, "absval_kernel", &err);
1478 CL_CHECK(err);
1479
1480 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1481 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
1482
1483 const int nb = ((*n) + 256 - 1) / 256;
1484 const size_t global_item_size = 256 * nb;
1485 const size_t local_item_size = 256;
1486
1489 0, NULL, NULL));
1490}
1491
1495void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue) {
1496 cl_int err;
1497
1498 if (math_program == NULL)
1500
1501 cl_kernel kernel = clCreateKernel(math_program, "iadd_kernel", &err);
1502 CL_CHECK(err);
1503
1504 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1505 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), c));
1506 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1507
1508 const int nb = ((*n) + 256 - 1) / 256;
1509 const size_t global_item_size = 256 * nb;
1510 const size_t local_item_size = 256;
1511
1514 0, NULL, NULL));
1516}
1517
1522void opencl_pwmax_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
1523 cl_int err;
1524
1525 if (math_program == NULL)
1527
1528 cl_kernel kernel = clCreateKernel(math_program, "pwmax_vec2_kernel", &err);
1529 CL_CHECK(err);
1530
1531 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1532 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1533 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1534
1535 const int nb = ((*n) + 256 - 1) / 256;
1536 const size_t global_item_size = 256 * nb;
1537 const size_t local_item_size = 256;
1538
1541 0, NULL, NULL));
1543}
1544
1549void opencl_pwmax_vec3(void *a, void *b, void *c,
1550 int *n, cl_command_queue cmd_queue) {
1551 cl_int err;
1552
1553 if (math_program == NULL)
1555
1556 cl_kernel kernel = clCreateKernel(math_program, "pwmax_vec3_kernel", &err);
1557 CL_CHECK(err);
1558
1559 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1560 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1561 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1562 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1563
1564 const int nb = ((*n) + 256 - 1) / 256;
1565 const size_t global_item_size = 256 * nb;
1566 const size_t local_item_size = 256;
1567
1570 0, NULL, NULL));
1572}
1573
1579 cl_int err;
1580
1581 if (math_program == NULL)
1583
1584 cl_kernel kernel = clCreateKernel(math_program, "pwmax_sca2_kernel", &err);
1585 CL_CHECK(err);
1586
1587 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1588 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
1589 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1590
1591 const int nb = ((*n) + 256 - 1) / 256;
1592 const size_t global_item_size = 256 * nb;
1593 const size_t local_item_size = 256;
1594
1597 0, NULL, NULL));
1599}
1600
1605void opencl_pwmax_sca3(void *a, void *b, real *c,
1606 int *n, cl_command_queue cmd_queue) {
1607 cl_int err;
1608
1609 if (math_program == NULL)
1611
1612 cl_kernel kernel = clCreateKernel(math_program, "pwmax_sca3_kernel", &err);
1613 CL_CHECK(err);
1614
1615 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1616 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1617 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
1618 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1619
1620 const int nb = ((*n) + 256 - 1) / 256;
1621 const size_t global_item_size = 256 * nb;
1622 const size_t local_item_size = 256;
1623
1626 0, NULL, NULL));
1628}
1629
1634void opencl_pwmin_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
1635 cl_int err;
1636
1637 if (math_program == NULL)
1639
1640 cl_kernel kernel = clCreateKernel(math_program, "pwmin_vec2_kernel", &err);
1641 CL_CHECK(err);
1642
1643 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1644 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1645 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1646
1647 const int nb = ((*n) + 256 - 1) / 256;
1648 const size_t global_item_size = 256 * nb;
1649 const size_t local_item_size = 256;
1650
1653 0, NULL, NULL));
1655}
1656
1661void opencl_pwmin_vec3(void *a, void *b, void *c,
1662 int *n, cl_command_queue cmd_queue) {
1663 cl_int err;
1664
1665 if (math_program == NULL)
1667
1668 cl_kernel kernel = clCreateKernel(math_program, "pwmin_vec3_kernel", &err);
1669 CL_CHECK(err);
1670
1671 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1672 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1673 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1674 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1675
1676 const int nb = ((*n) + 256 - 1) / 256;
1677 const size_t global_item_size = 256 * nb;
1678 const size_t local_item_size = 256;
1679
1682 0, NULL, NULL));
1684}
1685
1691 cl_int err;
1692
1693 if (math_program == NULL)
1695
1696 cl_kernel kernel = clCreateKernel(math_program, "pwmin_sca2_kernel", &err);
1697 CL_CHECK(err);
1698
1699 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1700 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
1701 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1702
1703 const int nb = ((*n) + 256 - 1) / 256;
1704 const size_t global_item_size = 256 * nb;
1705 const size_t local_item_size = 256;
1706
1709 0, NULL, NULL));
1711}
1712
1717void opencl_pwmin_sca3(void *a, void *b, real *c,
1718 int *n, cl_command_queue cmd_queue) {
1719 cl_int err;
1720
1721 if (math_program == NULL)
1723
1724 cl_kernel kernel = clCreateKernel(math_program, "pwmin_sca3_kernel", &err);
1725 CL_CHECK(err);
1726
1727 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1728 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1729 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
1730 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1731
1732 const int nb = ((*n) + 256 - 1) / 256;
1733 const size_t global_item_size = 256 * nb;
1734 const size_t local_item_size = 256;
1735
1738 0, NULL, NULL));
1740}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
const int i
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
void * glb_ctx
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition jit.c:50
void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1495
void opencl_col3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:817
void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:293
void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:155
void opencl_pwmax_sca2(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1578
void opencl_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cl_command_queue cmd_queue)
Definition math.c:1056
void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:93
void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:875
void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:790
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:576
void opencl_sub3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:902
void opencl_add2s1(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:515
void opencl_addcol3s2(void *a, void *b, void *c, real *s, int *n, cl_command_queue cmd_queue)
Definition math.c:990
void opencl_invcol1(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:737
void opencl_add3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:455
real opencl_glsc3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1100
void opencl_rone(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:227
void opencl_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, cl_command_queue cmd_queue)
Definition math.c:701
void opencl_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, cl_command_queue cmd_queue)
Definition math.c:668
void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:239
void opencl_cfill_mask(void *a, void *c, int *size, void *mask, int *mask_size, cl_command_queue cmd_queue)
Definition math.c:186
void opencl_cadd2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:373
void opencl_pwmin_sca3(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1717
void opencl_pwmax_vec3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1549
real * bufred
Definition math.c:1093
real opencl_glsum(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1324
void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:63
void opencl_add4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:484
void opencl_pwmin_vec3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1661
void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:347
void opencl_add2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:545
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cl_command_queue cmd_queue)
Definition math.c:1021
void opencl_pwmin_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1634
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cl_command_queue cmd_queue)
Definition math.c:637
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:1157
real opencl_glsubnorm2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1274
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:608
int red_s
Definition math.c:1092
void opencl_absval(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1471
void opencl_addcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:931
cl_mem bufred_d
Definition math.c:1094
real opencl_glmin(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1418
void opencl_pwmax_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1522
void opencl_rzero(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:215
void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:54
void opencl_subcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:846
void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:428
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:960
void opencl_pwmin_sca2(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1690
void opencl_pwmax_sca3(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1605
void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:763
real opencl_glsc2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1224
void opencl_cdiv2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:319
void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:401
void opencl_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:124
real opencl_glmax(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1369
void opencl_cmult2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:265
Object for handling masks in Neko.
Definition mask.f90:34
#define CL_CHECK(err)
Definition check.h:12
real * buf
Definition pipecg_aux.cu:42
void * math_program