Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.c
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifdef __APPLE__
36#include <OpenCL/cl.h>
37#else
38#include <CL/cl.h>
39#endif
40
41#include <stdio.h>
42#include <stdlib.h>
44#include <device/opencl/jit.h>
46#include <device/opencl/check.h>
47
48#include "math_kernel.cl.h"
49
53void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue) {
55 b, a, 0, 0, (*n) * sizeof(real),
56 0, NULL, NULL));
57}
58
62void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m,
64 cl_int err;
65
66 if (math_program == NULL)
68
69 cl_kernel kernel = clCreateKernel(math_program, "masked_copy_kernel", &err);
71
72 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
73 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
74 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
75 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
76 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
77
78 const int nb = ((*n) + 256 - 1) / 256;
79 const size_t global_item_size = 256 * nb;
80 const size_t local_item_size = 256;
81
84 0, NULL, NULL));
86
87}
88
92void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m,
94 cl_int err;
95
96 if (math_program == NULL)
98
99 cl_kernel kernel = clCreateKernel(math_program, "masked_gather_copy_kernel",
100 &err);
101 CL_CHECK(err);
102
103 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
104 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
105 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
106 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
107 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
108
109 const int nb = ((*n) + 256 - 1) / 256;
110 const size_t global_item_size = 256 * nb;
111 const size_t local_item_size = 256;
112
115 0, NULL, NULL));
117
118}
119
123void opencl_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n,
124 int *m, cl_command_queue cmd_queue) {
125 cl_int err;
126
127 if (math_program == NULL)
129
131 "masked_gather_copy_aligned_kernel", &err);
132 CL_CHECK(err);
133
134 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
135 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
136 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
137 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
138 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
139
140 const int nb = ((*n) + 256 - 1) / 256;
141 const size_t global_item_size = 256 * nb;
142 const size_t local_item_size = 256;
143
146 0, NULL, NULL));
148
149}
150
154void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m,
156 cl_int err;
157
158 if (math_program == NULL)
160
161 cl_kernel kernel = clCreateKernel(math_program, "masked_scatter_copy_kernel",
162 &err);
163 CL_CHECK(err);
164
165 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
166 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
167 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
168 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
169 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
170
171 const int nb = ((*n) + 256 - 1) / 256;
172 const size_t global_item_size = 256 * nb;
173 const size_t local_item_size = 256;
174
177 0, NULL, NULL));
179
180}
181
185void opencl_cfill_mask(void* a, void* c, int* size, void* mask, int* mask_size,
187 cl_int err;
188
189 if (math_program == NULL)
191
192 cl_kernel kernel = clCreateKernel(math_program, "cfill_mask_kernel", &err);
193 CL_CHECK(err);
194
195 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
196 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
197 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), size));
198 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &mask));
199 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), mask_size));
200
201 const int nb = ((*mask_size) + 256 - 1) / 256;
202 const size_t global_item_size = 256 * nb;
203 const size_t local_item_size = 256;
204
207 0, NULL, NULL));
209}
210
216 real zero = 0.0;
217
219 (*n) * sizeof(real), 0, NULL, &wait_kern));
221}
222
228 real one = 1.0;
229
231 (*n) * sizeof(real), 0, NULL, &wait_kern));
233}
234
238void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue) {
239 cl_int err;
240
241 if (math_program == NULL)
243
244 cl_kernel kernel = clCreateKernel(math_program, "cmult_kernel", &err);
245 CL_CHECK(err);
246
247 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
248 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
249 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
250
251 const int nb = ((*n) + 256 - 1) / 256;
252 const size_t global_item_size = 256 * nb;
253 const size_t local_item_size = 256;
254
257 0, NULL, NULL));
259}
260
264void opencl_cmult2(void *a, void *b, real *c, int *n,
266 cl_int err;
267
268 if (math_program == NULL)
270
271 cl_kernel kernel = clCreateKernel(math_program, "cmult2_kernel", &err);
272 CL_CHECK(err);
273
274 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
275 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
276 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
277 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
278
279 const int nb = ((*n) + 256 - 1) / 256;
280 const size_t global_item_size = 256 * nb;
281 const size_t local_item_size = 256;
282
285 0, NULL, NULL));
287}
288
292void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue) {
293 cl_int err;
294
295 if (math_program == NULL)
297
298 cl_kernel kernel = clCreateKernel(math_program, "cdiv_kernel", &err);
299 CL_CHECK(err);
300
301 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
302 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
303 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
304
305 const int nb = ((*n) + 256 - 1) / 256;
306 const size_t global_item_size = 256 * nb;
307 const size_t local_item_size = 256;
308
311 0, NULL, NULL));
313}
314
318void opencl_cdiv2(void *a, void *b, real *c, int *n,
320 cl_int err;
321
322 if (math_program == NULL)
324
325 cl_kernel kernel = clCreateKernel(math_program, "cdiv2_kernel", &err);
326 CL_CHECK(err);
327
328 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
329 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
330 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
331 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
332
333 const int nb = ((*n) + 256 - 1) / 256;
334 const size_t global_item_size = 256 * nb;
335 const size_t local_item_size = 256;
336
339 0, NULL, NULL));
341}
342
346void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue) {
347 cl_int err;
348
349 if (math_program == NULL)
351
352 cl_kernel kernel = clCreateKernel(math_program, "radd_kernel", &err);
353 CL_CHECK(err);
354
355 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
356 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
357 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
358
359 const int nb = ((*n) + 256 - 1) / 256;
360 const size_t global_item_size = 256 * nb;
361 const size_t local_item_size = 256;
362
365 0, NULL, NULL));
367}
368
372void opencl_cadd2(void *a, void *b, real *c, int *n,
374 cl_int err;
375
376 if (math_program == NULL)
378
379 cl_kernel kernel = clCreateKernel(math_program, "cadd2_kernel", &err);
380 CL_CHECK(err);
381
382 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
383 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
384 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
385 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
386
387 const int nb = ((*n) + 256 - 1) / 256;
388 const size_t global_item_size = 256 * nb;
389 const size_t local_item_size = 256;
390
393 0, NULL, NULL));
395}
396
400void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue) {
401 cl_int err;
402
403 if (math_program == NULL)
405
406 cl_kernel kernel = clCreateKernel(math_program, "cfill_kernel", &err);
407 CL_CHECK(err);
408
409 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
410 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
411 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
412
413 const int nb = ((*n) + 256 - 1) / 256;
414 const size_t global_item_size = 256 * nb;
415 const size_t local_item_size = 256;
416
419 0, NULL, NULL));
421}
422
427void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
428 cl_int err;
429
430 if (math_program == NULL)
432
433 cl_kernel kernel = clCreateKernel(math_program, "add2_kernel", &err);
434 CL_CHECK(err);
435
436 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
437 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
438 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
439
440 const int nb = ((*n) + 256 - 1) / 256;
441 const size_t global_item_size = 256 * nb;
442 const size_t local_item_size = 256;
443
446 0, NULL, NULL));
448}
449
454void opencl_add3(void *a, void *b, void *c, int *n,
456 cl_int err;
457
458 if (math_program == NULL)
460
461 cl_kernel kernel = clCreateKernel(math_program, "add3_kernel", &err);
462 CL_CHECK(err);
463
464 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
465 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
466 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
467 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
468
469 const int nb = ((*n) + 256 - 1) / 256;
470 const size_t global_item_size = 256 * nb;
471 const size_t local_item_size = 256;
472
475 0, NULL, NULL));
477}
478
483void opencl_add4(void *a, void *b, void *c, void *d, int *n,
485 cl_int err;
486
487 if (math_program == NULL)
489
490 cl_kernel kernel = clCreateKernel(math_program, "add4_kernel", &err);
491 CL_CHECK(err);
492
493 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
494 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
495 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
496 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
497 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
498
499 const int nb = ((*n) + 256 - 1) / 256;
500 const size_t global_item_size = 256 * nb;
501 const size_t local_item_size = 256;
502
505 0, NULL, NULL));
507}
508
514void opencl_add2s1(void *a, void *b, real *c1, int *n,
516 cl_int err;
517
518 if (math_program == NULL)
520
521 cl_kernel kernel = clCreateKernel(math_program, "add2s1_kernel", &err);
522 CL_CHECK(err);
523
524 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
525 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
526 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
527 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
528
529 const int nb = ((*n) + 256 - 1) / 256;
530 const size_t global_item_size = 256 * nb;
531 const size_t local_item_size = 256;
532
535 0, NULL, NULL));
537}
538
544void opencl_add2s2(void *a, void *b, real *c1, int *n,
546 cl_int err;
547
548 if (math_program == NULL)
550
551 cl_kernel kernel = clCreateKernel(math_program, "add2s2_kernel", &err);
552 CL_CHECK(err);
553
554 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
555 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
556 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
557 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
558
559 const int nb = ((*n) + 256 - 1) / 256;
560 const size_t global_item_size = 256 * nb;
561 const size_t local_item_size = 256;
562
565 0, NULL, NULL));
567}
568
575void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n,
577 cl_int err;
578
579 if (math_program == NULL)
581
582 cl_kernel kernel = clCreateKernel(math_program, "add2s2_many_kernel", &err);
583 CL_CHECK(err);
584
585 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &x));
586 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &p));
587 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &alpha));
588 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), j));
589 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
590
591 const int nb = ((*n) + 256 - 1) / 256;
592 const size_t global_item_size = 256 * nb;
593 const size_t local_item_size = 256;
594
597 0, NULL, NULL));
599
600}
601
607void opencl_addsqr2s2(void *a, void *b, real *c1, int *n,
609 cl_int err;
610
611 if (math_program == NULL)
613
614 cl_kernel kernel = clCreateKernel(math_program, "addsqr2s2_kernel", &err);
615 CL_CHECK(err);
616
617 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
618 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
619 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
620 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
621
622 const int nb = ((*n) + 256 - 1) / 256;
623 const size_t global_item_size = 256 * nb;
624 const size_t local_item_size = 256;
625
628 0, NULL, NULL));
630}
631
636void opencl_add3s2(void *a, void *b, void * c, real *c1, real *c2, int *n,
638 cl_int err;
639
640 if (math_program == NULL)
642
643 cl_kernel kernel = clCreateKernel(math_program, "add3s2_kernel", &err);
644 CL_CHECK(err);
645
646 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
647 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
648 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
649 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), c1));
650 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c2));
651 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
652
653 const int nb = ((*n) + 256 - 1) / 256;
654 const size_t global_item_size = 256 * nb;
655 const size_t local_item_size = 256;
656
659 0, NULL, NULL));
661}
662
667void opencl_add4s3(void *a, void *b, void * c, void * d, real *c1, real *c2,
668 real *c3, int *n, cl_command_queue cmd_queue) {
669 cl_int err;
670
671 if (math_program == NULL)
673
674 cl_kernel kernel = clCreateKernel(math_program, "add4s3_kernel", &err);
675 CL_CHECK(err);
676
677 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
678 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
679 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
680 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
681 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c1));
682 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(real), c2));
683 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(real), c3));
684 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
685
686 const int nb = ((*n) + 256 - 1) / 256;
687 const size_t global_item_size = 256 * nb;
688 const size_t local_item_size = 256;
689
692 0, NULL, NULL));
694}
695
700void opencl_add5s4(void *a, void *b, void * c, void * d, void * e, real *c1,
701 real *c2, real *c3, real * c4, int *n,
703 cl_int err;
704
705 if (math_program == NULL)
707
708 cl_kernel kernel = clCreateKernel(math_program, "add5s4_kernel", &err);
709 CL_CHECK(err);
710
711 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
712 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
713 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
714 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
715 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &e));
716 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(real), c1));
717 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(real), c2));
718 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(real), c3));
719 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(real), c4));
720 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), n));
721
722 const int nb = ((*n) + 256 - 1) / 256;
723 const size_t global_item_size = 256 * nb;
724 const size_t local_item_size = 256;
725
728 0, NULL, NULL));
730}
731
737 cl_int err;
738
739 if (math_program == NULL)
741
742 cl_kernel kernel = clCreateKernel(math_program, "invcol1_kernel", &err);
743 CL_CHECK(err);
744
745 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
746 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
747
748 const int nb = ((*n) + 256 - 1) / 256;
749 const size_t global_item_size = 256 * nb;
750 const size_t local_item_size = 256;
751
754 0, NULL, NULL));
756}
757
762void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
763 cl_int err;
764
765 if (math_program == NULL)
767
768 cl_kernel kernel = clCreateKernel(math_program, "invcol2_kernel", &err);
769 CL_CHECK(err);
770
771 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
772 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
773 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
774
775 const int nb = ((*n) + 256 - 1) / 256;
776 const size_t global_item_size = 256 * nb;
777 const size_t local_item_size = 256;
778
781 0, NULL, NULL));
783}
784
789void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
790 cl_int err;
791
792 if (math_program == NULL)
794
795 cl_kernel kernel = clCreateKernel(math_program, "col2_kernel", &err);
796 CL_CHECK(err);
797
798 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
799 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
800 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
801
802 const int nb = ((*n) + 256 - 1) / 256;
803 const size_t global_item_size = 256 * nb;
804 const size_t local_item_size = 256;
805
808 0, NULL, NULL));
810}
811
816void opencl_col3(void *a, void *b, void *c, int *n,
818 cl_int err;
819
820 if (math_program == NULL)
822
823 cl_kernel kernel = clCreateKernel(math_program, "col3_kernel", &err);
824 CL_CHECK(err);
825
826 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
827 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
828 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
829 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
830
831 const int nb = ((*n) + 256 - 1) / 256;
832 const size_t global_item_size = 256 * nb;
833 const size_t local_item_size = 256;
834
837 0, NULL, NULL));
839}
840
845void opencl_subcol3(void *a, void *b, void *c, int *n,
847 cl_int err;
848
849 if (math_program == NULL)
851
852 cl_kernel kernel = clCreateKernel(math_program, "subcol3_kernel", &err);
853 CL_CHECK(err);
854
855 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
856 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
857 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
858 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
859
860 const int nb = ((*n) + 256 - 1) / 256;
861 const size_t global_item_size = 256 * nb;
862 const size_t local_item_size = 256;
863
866 0, NULL, NULL));
868}
869
874void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
875 cl_int err;
876
877 if (math_program == NULL)
879
880 cl_kernel kernel = clCreateKernel(math_program, "sub2_kernel", &err);
881 CL_CHECK(err);
882
883 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
884 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
885 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
886
887 const int nb = ((*n) + 256 - 1) / 256;
888 const size_t global_item_size = 256 * nb;
889 const size_t local_item_size = 256;
890
893 0, NULL, NULL));
895}
896
901void opencl_sub3(void *a, void *b, void *c, int *n,
903 cl_int err;
904
905 if (math_program == NULL)
907
908 cl_kernel kernel = clCreateKernel(math_program, "sub3_kernel", &err);
909 CL_CHECK(err);
910
911 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
912 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
913 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
914 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
915
916 const int nb = ((*n) + 256 - 1) / 256;
917 const size_t global_item_size = 256 * nb;
918 const size_t local_item_size = 256;
919
922 0, NULL, NULL));
924}
925
930void opencl_addcol3(void *a, void *b, void *c, int *n,
932 cl_int err;
933
934 if (math_program == NULL)
936
937 cl_kernel kernel = clCreateKernel(math_program, "addcol3_kernel", &err);
938 CL_CHECK(err);
939
940 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
941 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
942 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
943 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
944
945 const int nb = ((*n) + 256 - 1) / 256;
946 const size_t global_item_size = 256 * nb;
947 const size_t local_item_size = 256;
948
951 0, NULL, NULL));
953}
954
959void opencl_addcol4(void *a, void *b, void *c, void *d, int *n,
961 cl_int err;
962
963 if (math_program == NULL)
965
966 cl_kernel kernel = clCreateKernel(math_program, "addcol4_kernel", &err);
967 CL_CHECK(err);
968
969 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
970 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
971 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
972 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
973 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
974
975 const int nb = ((*n) + 256 - 1) / 256;
976 const size_t global_item_size = 256 * nb;
977 const size_t local_item_size = 256;
978
981 0, NULL, NULL));
983}
984
989void opencl_addcol3s2(void *a, void *b, void *c, real *s, int *n,
991 cl_int err;
992
993 if (math_program == NULL)
995
996 cl_kernel kernel = clCreateKernel(math_program, "addcol3s2_kernel", &err);
997 CL_CHECK(err);
998
999 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1000 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1001 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1002 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), s));
1003 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
1004
1005 const int nb = ((*n) + 256 - 1) / 256;
1006 const size_t global_item_size = 256 * nb;
1007 const size_t local_item_size = 256;
1008
1011 0, NULL, NULL));
1013}
1014
1020void opencl_vdot3(void *dot, void *u1, void *u2, void *u3,
1021 void *v1, void *v2, void *v3, int *n,
1023 cl_int err;
1024
1025 if (math_program == NULL)
1027
1028 cl_kernel kernel = clCreateKernel(math_program, "vdot3_kernel", &err);
1029 CL_CHECK(err);
1030
1031 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dot));
1032 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u1));
1033 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u2));
1034 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u3));
1035 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v1));
1036 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v2));
1037 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &v3));
1038 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
1039
1040 const int nb = ((*n) + 256 - 1) / 256;
1041 const size_t global_item_size = 256 * nb;
1042 const size_t local_item_size = 256;
1043
1046 0, NULL, NULL));
1048}
1049
1055void opencl_vcross(void *u1, void *u2, void *u3,
1056 void *v1, void *v2, void *v3,
1057 void *w1, void *w2, void *w3,
1058 int *n, cl_command_queue cmd_queue) {
1059
1060 cl_int err;
1061
1062 if (math_program == NULL)
1064
1065 cl_kernel kernel = clCreateKernel(math_program, "vcross_kernel", &err);
1066 CL_CHECK(err);
1067
1068 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &u1));
1069 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u2));
1070 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u3));
1071 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &v1));
1072 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v2));
1073 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v3));
1074 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &w1));
1075 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &w2));
1076 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3));
1077 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), n));
1078
1079 const int nb = ((*n) + 256 - 1) / 256;
1080 const size_t global_item_size = 256 * nb;
1081 const size_t local_item_size = 256;
1082
1085 0, NULL, NULL));
1087
1088}
1089
1091int red_s = 0;
1094
1099real opencl_glsc3(void *a, void *b, void *c, int *n,
1101 cl_int err;
1103 int i;
1104
1105 if (math_program == NULL)
1107
1108 const int nb = ((*n) + 256 - 1) / 256;
1109 const size_t global_item_size = 256 * nb;
1110 const size_t local_item_size = 256;
1111
1112 if ( nb > red_s){
1113 red_s = nb;
1114 if (bufred != NULL) {
1115 free(bufred);
1117 }
1118 bufred = (real *) malloc(nb * sizeof(real));
1119
1121 nb * sizeof(real), NULL, &err);
1122 CL_CHECK(err);
1123 }
1124
1125 cl_kernel kernel = clCreateKernel(math_program, "glsc3_kernel", &err);
1126 CL_CHECK(err);
1127
1128 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1129 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1130 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1131 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
1132 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
1133
1136 0, NULL, &kern_wait));
1137
1139 nb * sizeof(real), bufred, 1,
1140 &kern_wait, NULL));
1141
1142 real res = 0.0;
1143 for (i = 0; i < nb; i++) {
1144 res += bufred[i];
1145 }
1146
1148
1149 return res;
1150}
1151
1156void opencl_glsc3_many(real *h, void * w, void *v, void *mult, int *j, int *n,
1158 int i, k;
1159 cl_int err;
1161
1162 if (math_program == NULL)
1164
1165 int pow2 = 1;
1166 while(pow2 < (*j)){
1167 pow2 = 2*pow2;
1168 }
1169
1170 const int nt = 256 / pow2;
1171 const int nb = ((*n) + nt - 1) / nt;
1172 const size_t local_item_size[2] = {nt, pow2};
1173 const size_t global_item_size[2] = {nb * nt, pow2};
1174
1175 if((*j)*nb > red_s) {
1176 red_s = (*j)*nb;
1177 if (bufred != NULL) {
1178 free(bufred);
1180 }
1181 bufred = (real *) malloc((*j) * nb * sizeof(real));
1182
1184 (*j) * nb * sizeof(real), NULL, &err);
1185 CL_CHECK(err);
1186 }
1187
1188 cl_kernel kernel = clCreateKernel(math_program, "glsc3_many_kernel", &err);
1189 CL_CHECK(err);
1190
1191 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w));
1192 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &v));
1193 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mult));
1194 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
1195 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), j));
1196 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
1197
1200 0, NULL, &kern_wait));
1201
1203 (*j) * nb * sizeof(real),
1204 bufred, 1, &kern_wait, NULL));
1205
1206 for (k = 0; k < (*j); k++) {
1207 h[k] = 0.0;
1208 }
1209
1210 for (i = 0; i < nb; i++) {
1211 for (k = 0; k < (*j); k++) {
1212 h[k] += bufred[i*(*j)+k];
1213 }
1214 }
1215
1217}
1218
1224 cl_int err;
1226 int i;
1227
1228 if (math_program == NULL)
1230
1231 const int nb = ((*n) + 256 - 1) / 256;
1232 const size_t global_item_size = 256 * nb;
1233 const size_t local_item_size = 256;
1234
1235 real * buf = (real *) malloc(nb * sizeof(real));
1236
1237 cl_kernel kernel = clCreateKernel(math_program, "glsc2_kernel", &err);
1238 CL_CHECK(err);
1239
1241 nb * sizeof(real), NULL, &err);
1242 CL_CHECK(err);
1243
1244 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1245 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1246 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1247 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1248
1251 0, NULL, &kern_wait));
1252
1254 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1255
1256 real res = 0.0;
1257 for (i = 0; i < nb; i++) {
1258 res += buf[i];
1259 }
1260
1261 free(buf);
1264
1265 return res;
1266}
1267
1273 cl_int err;
1275 int i;
1276
1277 if (math_program == NULL)
1279
1280 const int nb = ((*n) + 256 - 1) / 256;
1281 const size_t global_item_size = 256 * nb;
1282 const size_t local_item_size = 256;
1283
1284 real * buf = (real *) malloc(nb * sizeof(real));
1285
1286 cl_kernel kernel = clCreateKernel(math_program, "glsubnorm2_kernel", &err);
1287 CL_CHECK(err);
1288
1290 nb * sizeof(real), NULL, &err);
1291 CL_CHECK(err);
1292
1293 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1294 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1295 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1296 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1297
1300 0, NULL, &kern_wait));
1301
1303 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1304
1305 real res = 0.0;
1306 for (i = 0; i < nb; i++) {
1307 res += buf[i];
1308 }
1309
1310 free(buf);
1313
1314 return res;
1315}
1316
1322 cl_int err;
1324 int i;
1325
1326 if (math_program == NULL)
1328
1329 const int nb = ((*n) + 256 - 1) / 256;
1330 const size_t global_item_size = 256 * nb;
1331 const size_t local_item_size = 256;
1332
1333 real * buf = (real *) malloc(nb * sizeof(real));
1334
1335 cl_kernel kernel = clCreateKernel(math_program, "glsum_kernel", &err);
1336 CL_CHECK(err);
1337
1339 nb * sizeof(real), NULL, &err);
1340 CL_CHECK(err);
1341
1342 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1343 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
1344 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1345
1348 0, NULL, &kern_wait));
1349
1351 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1352
1353 real res = 0.0;
1354 for (i = 0; i < nb; i++) {
1355 res += buf[i];
1356 }
1357
1358 free(buf);
1361
1362 return res;
1363}
1364
1365
1370 cl_int err;
1371
1372 if (math_program == NULL)
1374
1375 cl_kernel kernel = clCreateKernel(math_program, "absval_kernel", &err);
1376 CL_CHECK(err);
1377
1378 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1379 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
1380
1381 const int nb = ((*n) + 256 - 1) / 256;
1382 const size_t global_item_size = 256 * nb;
1383 const size_t local_item_size = 256;
1384
1387 0, NULL, NULL));
1388}
1389
1393void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue) {
1394 cl_int err;
1395
1396 if (math_program == NULL)
1398
1399 cl_kernel kernel = clCreateKernel(math_program, "iadd_kernel", &err);
1400 CL_CHECK(err);
1401
1402 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1403 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), c));
1404 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1405
1406 const int nb = ((*n) + 256 - 1) / 256;
1407 const size_t global_item_size = 256 * nb;
1408 const size_t local_item_size = 256;
1409
1412 0, NULL, NULL));
1414}
1415
1420void opencl_pwmax_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
1421 cl_int err;
1422
1423 if (math_program == NULL)
1425
1426 cl_kernel kernel = clCreateKernel(math_program, "pwmax_vec2_kernel", &err);
1427 CL_CHECK(err);
1428
1429 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1430 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1431 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1432
1433 const int nb = ((*n) + 256 - 1) / 256;
1434 const size_t global_item_size = 256 * nb;
1435 const size_t local_item_size = 256;
1436
1439 0, NULL, NULL));
1441}
1442
1447void opencl_pwmax_vec3(void *a, void *b, void *c,
1448 int *n, cl_command_queue cmd_queue) {
1449 cl_int err;
1450
1451 if (math_program == NULL)
1453
1454 cl_kernel kernel = clCreateKernel(math_program, "pwmax_vec3_kernel", &err);
1455 CL_CHECK(err);
1456
1457 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1458 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1459 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1460 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1461
1462 const int nb = ((*n) + 256 - 1) / 256;
1463 const size_t global_item_size = 256 * nb;
1464 const size_t local_item_size = 256;
1465
1468 0, NULL, NULL));
1470}
1471
1477 cl_int err;
1478
1479 if (math_program == NULL)
1481
1482 cl_kernel kernel = clCreateKernel(math_program, "pwmax_sca2_kernel", &err);
1483 CL_CHECK(err);
1484
1485 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1486 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
1487 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1488
1489 const int nb = ((*n) + 256 - 1) / 256;
1490 const size_t global_item_size = 256 * nb;
1491 const size_t local_item_size = 256;
1492
1495 0, NULL, NULL));
1497}
1498
1503void opencl_pwmax_sca3(void *a, void *b, real *c,
1504 int *n, cl_command_queue cmd_queue) {
1505 cl_int err;
1506
1507 if (math_program == NULL)
1509
1510 cl_kernel kernel = clCreateKernel(math_program, "pwmax_sca3_kernel", &err);
1511 CL_CHECK(err);
1512
1513 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1514 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1515 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
1516 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1517
1518 const int nb = ((*n) + 256 - 1) / 256;
1519 const size_t global_item_size = 256 * nb;
1520 const size_t local_item_size = 256;
1521
1524 0, NULL, NULL));
1526}
1527
1532void opencl_pwmin_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
1533 cl_int err;
1534
1535 if (math_program == NULL)
1537
1538 cl_kernel kernel = clCreateKernel(math_program, "pwmin_vec2_kernel", &err);
1539 CL_CHECK(err);
1540
1541 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1542 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1543 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1544
1545 const int nb = ((*n) + 256 - 1) / 256;
1546 const size_t global_item_size = 256 * nb;
1547 const size_t local_item_size = 256;
1548
1551 0, NULL, NULL));
1553}
1554
1559void opencl_pwmin_vec3(void *a, void *b, void *c,
1560 int *n, cl_command_queue cmd_queue) {
1561 cl_int err;
1562
1563 if (math_program == NULL)
1565
1566 cl_kernel kernel = clCreateKernel(math_program, "pwmin_vec3_kernel", &err);
1567 CL_CHECK(err);
1568
1569 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1570 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1571 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1572 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1573
1574 const int nb = ((*n) + 256 - 1) / 256;
1575 const size_t global_item_size = 256 * nb;
1576 const size_t local_item_size = 256;
1577
1580 0, NULL, NULL));
1582}
1583
1589 cl_int err;
1590
1591 if (math_program == NULL)
1593
1594 cl_kernel kernel = clCreateKernel(math_program, "pwmin_sca2_kernel", &err);
1595 CL_CHECK(err);
1596
1597 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1598 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
1599 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1600
1601 const int nb = ((*n) + 256 - 1) / 256;
1602 const size_t global_item_size = 256 * nb;
1603 const size_t local_item_size = 256;
1604
1607 0, NULL, NULL));
1609}
1610
1615void opencl_pwmin_sca3(void *a, void *b, real *c,
1616 int *n, cl_command_queue cmd_queue) {
1617 cl_int err;
1618
1619 if (math_program == NULL)
1621
1622 cl_kernel kernel = clCreateKernel(math_program, "pwmin_sca3_kernel", &err);
1623 CL_CHECK(err);
1624
1625 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1626 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1627 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
1628 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1629
1630 const int nb = ((*n) + 256 - 1) / 256;
1631 const size_t global_item_size = 256 * nb;
1632 const size_t local_item_size = 256;
1633
1636 0, NULL, NULL));
1638}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
const int i
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
void * glb_ctx
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition jit.c:50
void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1393
void opencl_col3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:816
void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:292
void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:154
void opencl_pwmax_sca2(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1476
void opencl_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cl_command_queue cmd_queue)
Definition math.c:1055
void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:92
void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:874
void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:789
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:575
void opencl_sub3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:901
void opencl_add2s1(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:514
void opencl_addcol3s2(void *a, void *b, void *c, real *s, int *n, cl_command_queue cmd_queue)
Definition math.c:989
void opencl_invcol1(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:736
void opencl_add3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:454
real opencl_glsc3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1099
void opencl_rone(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:226
void opencl_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, cl_command_queue cmd_queue)
Definition math.c:700
void opencl_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, cl_command_queue cmd_queue)
Definition math.c:667
void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:238
void opencl_cfill_mask(void *a, void *c, int *size, void *mask, int *mask_size, cl_command_queue cmd_queue)
Definition math.c:185
void opencl_cadd2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:372
void opencl_pwmin_sca3(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1615
void opencl_pwmax_vec3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1447
real * bufred
Definition math.c:1092
real opencl_glsum(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1321
void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:62
void opencl_add4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:483
void opencl_pwmin_vec3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1559
void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:346
void opencl_add2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:544
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cl_command_queue cmd_queue)
Definition math.c:1020
void opencl_pwmin_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1532
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cl_command_queue cmd_queue)
Definition math.c:636
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:1156
real opencl_glsubnorm2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1272
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:607
int red_s
Definition math.c:1091
void opencl_absval(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1369
void opencl_addcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:930
cl_mem bufred_d
Definition math.c:1093
void opencl_pwmax_vec2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1420
void opencl_rzero(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:214
void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:53
void opencl_subcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:845
void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:427
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:959
void opencl_pwmin_sca2(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1588
void opencl_pwmax_sca3(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1503
void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:762
real opencl_glsc2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1223
void opencl_cdiv2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:318
void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:400
void opencl_masked_gather_copy_aligned(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:123
void opencl_cmult2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:264
Object for handling masks in Neko.
Definition mask.f90:34
#define CL_CHECK(err)
Definition check.h:12
real * buf
Definition pipecg_aux.cu:42
void * math_program