Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.c
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifdef __APPLE__
36#include <OpenCL/cl.h>
37#else
38#include <CL/cl.h>
39#endif
40
41#include <stdio.h>
42#include <stdlib.h>
44#include <device/opencl/jit.h>
46#include <device/opencl/check.h>
47
48#include "math_kernel.cl.h"
49
53void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue) {
55 b, a, 0, 0, (*n) * sizeof(real),
56 0, NULL, NULL));
57}
58
62void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m,
64 cl_int err;
65
66 if (math_program == NULL)
68
69 cl_kernel kernel = clCreateKernel(math_program, "masked_copy_kernel", &err);
71
72 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
73 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
74 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
75 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
76 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
77
78 const int nb = ((*n) + 256 - 1) / 256;
79 const size_t global_item_size = 256 * nb;
80 const size_t local_item_size = 256;
81
84 0, NULL, NULL));
85
86}
87
91void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m,
93 cl_int err;
94
95 if (math_program == NULL)
97
98 cl_kernel kernel = clCreateKernel(math_program, "masked_gather_copy_kernel",
99 &err);
100 CL_CHECK(err);
101
102 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
103 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
104 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
105 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
106 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
107
108 const int nb = ((*n) + 256 - 1) / 256;
109 const size_t global_item_size = 256 * nb;
110 const size_t local_item_size = 256;
111
114 0, NULL, NULL));
115
116}
117
121void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m,
123 cl_int err;
124
125 if (math_program == NULL)
127
128 cl_kernel kernel = clCreateKernel(math_program, "masked_scatter_copy_kernel",
129 &err);
130 CL_CHECK(err);
131
132 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
133 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
134 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
135 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
136 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
137
138 const int nb = ((*n) + 256 - 1) / 256;
139 const size_t global_item_size = 256 * nb;
140 const size_t local_item_size = 256;
141
144 0, NULL, NULL));
145
146}
147
151void opencl_cfill_mask(void* a, void* c, int* size, void* mask, int* mask_size,
153 cl_int err;
154
155 if (math_program == NULL)
157
158 cl_kernel kernel = clCreateKernel(math_program, "cfill_mask_kernel", &err);
159 CL_CHECK(err);
160
161 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
162 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
163 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), size));
164 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &mask));
165 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), mask_size));
166
167 const int nb = ((*mask_size) + 256 - 1) / 256;
168 const size_t global_item_size = 256 * nb;
169 const size_t local_item_size = 256;
170
173 0, NULL, NULL));
174 }
175
181 real zero = 0.0;
182
184 (*n) * sizeof(real), 0, NULL, &wait_kern));
186}
187
193 real one = 1.0;
194
196 (*n) * sizeof(real), 0, NULL, &wait_kern));
198}
199
203void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue) {
204 cl_int err;
205
206 if (math_program == NULL)
208
209 cl_kernel kernel = clCreateKernel(math_program, "cmult_kernel", &err);
210 CL_CHECK(err);
211
212 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
213 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
214 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
215
216 const int nb = ((*n) + 256 - 1) / 256;
217 const size_t global_item_size = 256 * nb;
218 const size_t local_item_size = 256;
219
222 0, NULL, NULL));
223}
224
228void opencl_cmult2(void *a, void *b, real *c, int *n,
230 cl_int err;
231
232 if (math_program == NULL)
234
235 cl_kernel kernel = clCreateKernel(math_program, "cmult2_kernel", &err);
236 CL_CHECK(err);
237
238 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
239 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
240 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
241 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
242
243 const int nb = ((*n) + 256 - 1) / 256;
244 const size_t global_item_size = 256 * nb;
245 const size_t local_item_size = 256;
246
249 0, NULL, NULL));
250}
251
255void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue) {
256 cl_int err;
257
258 if (math_program == NULL)
260
261 cl_kernel kernel = clCreateKernel(math_program, "cdiv_kernel", &err);
262 CL_CHECK(err);
263
264 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
265 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
266 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
267
268 const int nb = ((*n) + 256 - 1) / 256;
269 const size_t global_item_size = 256 * nb;
270 const size_t local_item_size = 256;
271
274 0, NULL, NULL));
275}
276
280void opencl_cdiv2(void *a, void *b, real *c, int *n,
282 cl_int err;
283
284 if (math_program == NULL)
286
287 cl_kernel kernel = clCreateKernel(math_program, "cdiv2_kernel", &err);
288 CL_CHECK(err);
289
290 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
291 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
292 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
293 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
294
295 const int nb = ((*n) + 256 - 1) / 256;
296 const size_t global_item_size = 256 * nb;
297 const size_t local_item_size = 256;
298
301 0, NULL, NULL));
302}
303
307void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue) {
308 cl_int err;
309
310 if (math_program == NULL)
312
313 cl_kernel kernel = clCreateKernel(math_program, "radd_kernel", &err);
314 CL_CHECK(err);
315
316 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
317 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
318 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
319
320 const int nb = ((*n) + 256 - 1) / 256;
321 const size_t global_item_size = 256 * nb;
322 const size_t local_item_size = 256;
323
326 0, NULL, NULL));
327}
328
332void opencl_cadd2(void *a, void *b, real *c, int *n,
334 cl_int err;
335
336 if (math_program == NULL)
338
339 cl_kernel kernel = clCreateKernel(math_program, "cadd2_kernel", &err);
340 CL_CHECK(err);
341
342 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
343 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
344 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
345 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
346
347 const int nb = ((*n) + 256 - 1) / 256;
348 const size_t global_item_size = 256 * nb;
349 const size_t local_item_size = 256;
350
353 0, NULL, NULL));
354}
355
359void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue) {
360 cl_int err;
361
362 if (math_program == NULL)
364
365 cl_kernel kernel = clCreateKernel(math_program, "cfill_kernel", &err);
366 CL_CHECK(err);
367
368 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
369 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
370 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
371
372 const int nb = ((*n) + 256 - 1) / 256;
373 const size_t global_item_size = 256 * nb;
374 const size_t local_item_size = 256;
375
378 0, NULL, NULL));
379}
380
385void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
386 cl_int err;
387
388 if (math_program == NULL)
390
391 cl_kernel kernel = clCreateKernel(math_program, "add2_kernel", &err);
392 CL_CHECK(err);
393
394 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
395 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
396 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
397
398 const int nb = ((*n) + 256 - 1) / 256;
399 const size_t global_item_size = 256 * nb;
400 const size_t local_item_size = 256;
401
404 0, NULL, NULL));
405}
406
411void opencl_add3(void *a, void *b, void *c, int *n,
413 cl_int err;
414
415 if (math_program == NULL)
417
418 cl_kernel kernel = clCreateKernel(math_program, "add3_kernel", &err);
419 CL_CHECK(err);
420
421 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
422 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
423 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
424 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
425
426 const int nb = ((*n) + 256 - 1) / 256;
427 const size_t global_item_size = 256 * nb;
428 const size_t local_item_size = 256;
429
432 0, NULL, NULL));
433}
434
439void opencl_add4(void *a, void *b, void *c, void *d, int *n,
441 cl_int err;
442
443 if (math_program == NULL)
445
446 cl_kernel kernel = clCreateKernel(math_program, "add4_kernel", &err);
447 CL_CHECK(err);
448
449 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
450 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
451 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
452 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
453 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
454
455 const int nb = ((*n) + 256 - 1) / 256;
456 const size_t global_item_size = 256 * nb;
457 const size_t local_item_size = 256;
458
461 0, NULL, NULL));
462}
463
469void opencl_add2s1(void *a, void *b, real *c1, int *n,
471 cl_int err;
472
473 if (math_program == NULL)
475
476 cl_kernel kernel = clCreateKernel(math_program, "add2s1_kernel", &err);
477 CL_CHECK(err);
478
479 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
480 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
481 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
482 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
483
484 const int nb = ((*n) + 256 - 1) / 256;
485 const size_t global_item_size = 256 * nb;
486 const size_t local_item_size = 256;
487
490 0, NULL, NULL));
491}
492
498void opencl_add2s2(void *a, void *b, real *c1, int *n,
500 cl_int err;
501
502 if (math_program == NULL)
504
505 cl_kernel kernel = clCreateKernel(math_program, "add2s2_kernel", &err);
506 CL_CHECK(err);
507
508 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
509 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
510 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
511 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
512
513 const int nb = ((*n) + 256 - 1) / 256;
514 const size_t global_item_size = 256 * nb;
515 const size_t local_item_size = 256;
516
519 0, NULL, NULL));
520}
521
528void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n,
530 cl_int err;
531
532 if (math_program == NULL)
534
535 cl_kernel kernel = clCreateKernel(math_program, "add2s2_many_kernel", &err);
536 CL_CHECK(err);
537
538 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &x));
539 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &p));
540 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &alpha));
541 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), j));
542 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
543
544 const int nb = ((*n) + 256 - 1) / 256;
545 const size_t global_item_size = 256 * nb;
546 const size_t local_item_size = 256;
547
550 0, NULL, NULL));
551
552}
553
559void opencl_addsqr2s2(void *a, void *b, real *c1, int *n,
561 cl_int err;
562
563 if (math_program == NULL)
565
566 cl_kernel kernel = clCreateKernel(math_program, "addsqr2s2_kernel", &err);
567 CL_CHECK(err);
568
569 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
570 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
571 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
572 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
573
574 const int nb = ((*n) + 256 - 1) / 256;
575 const size_t global_item_size = 256 * nb;
576 const size_t local_item_size = 256;
577
580 0, NULL, NULL));
581}
582
587void opencl_add3s2(void *a, void *b, void * c, real *c1, real *c2, int *n,
589 cl_int err;
590
591 if (math_program == NULL)
593
594 cl_kernel kernel = clCreateKernel(math_program, "add3s2_kernel", &err);
595 CL_CHECK(err);
596
597 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
598 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
599 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
600 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), c1));
601 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c2));
602 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
603
604 const int nb = ((*n) + 256 - 1) / 256;
605 const size_t global_item_size = 256 * nb;
606 const size_t local_item_size = 256;
607
610 0, NULL, NULL));
611}
612
618 cl_int err;
619
620 if (math_program == NULL)
622
623 cl_kernel kernel = clCreateKernel(math_program, "invcol1_kernel", &err);
624 CL_CHECK(err);
625
626 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
627 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
628
629 const int nb = ((*n) + 256 - 1) / 256;
630 const size_t global_item_size = 256 * nb;
631 const size_t local_item_size = 256;
632
635 0, NULL, NULL));
636}
637
642void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
643 cl_int err;
644
645 if (math_program == NULL)
647
648 cl_kernel kernel = clCreateKernel(math_program, "invcol2_kernel", &err);
649 CL_CHECK(err);
650
651 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
652 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
653 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
654
655 const int nb = ((*n) + 256 - 1) / 256;
656 const size_t global_item_size = 256 * nb;
657 const size_t local_item_size = 256;
658
661 0, NULL, NULL));
662}
663
668void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
669 cl_int err;
670
671 if (math_program == NULL)
673
674 cl_kernel kernel = clCreateKernel(math_program, "col2_kernel", &err);
675 CL_CHECK(err);
676
677 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
678 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
679 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
680
681 const int nb = ((*n) + 256 - 1) / 256;
682 const size_t global_item_size = 256 * nb;
683 const size_t local_item_size = 256;
684
687 0, NULL, NULL));
688}
689
694void opencl_col3(void *a, void *b, void *c, int *n,
696 cl_int err;
697
698 if (math_program == NULL)
700
701 cl_kernel kernel = clCreateKernel(math_program, "col3_kernel", &err);
702 CL_CHECK(err);
703
704 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
705 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
706 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
707 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
708
709 const int nb = ((*n) + 256 - 1) / 256;
710 const size_t global_item_size = 256 * nb;
711 const size_t local_item_size = 256;
712
715 0, NULL, NULL));
716}
717
722void opencl_subcol3(void *a, void *b, void *c, int *n,
724 cl_int err;
725
726 if (math_program == NULL)
728
729 cl_kernel kernel = clCreateKernel(math_program, "subcol3_kernel", &err);
730 CL_CHECK(err);
731
732 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
733 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
734 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
735 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
736
737 const int nb = ((*n) + 256 - 1) / 256;
738 const size_t global_item_size = 256 * nb;
739 const size_t local_item_size = 256;
740
743 0, NULL, NULL));
744}
745
750void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
751 cl_int err;
752
753 if (math_program == NULL)
755
756 cl_kernel kernel = clCreateKernel(math_program, "sub2_kernel", &err);
757 CL_CHECK(err);
758
759 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
760 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
761 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
762
763 const int nb = ((*n) + 256 - 1) / 256;
764 const size_t global_item_size = 256 * nb;
765 const size_t local_item_size = 256;
766
769 0, NULL, NULL));
770}
771
776void opencl_sub3(void *a, void *b, void *c, int *n,
778 cl_int err;
779
780 if (math_program == NULL)
782
783 cl_kernel kernel = clCreateKernel(math_program, "sub3_kernel", &err);
784 CL_CHECK(err);
785
786 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
787 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
788 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
789 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
790
791 const int nb = ((*n) + 256 - 1) / 256;
792 const size_t global_item_size = 256 * nb;
793 const size_t local_item_size = 256;
794
797 0, NULL, NULL));
798}
799
804void opencl_addcol3(void *a, void *b, void *c, int *n,
806 cl_int err;
807
808 if (math_program == NULL)
810
811 cl_kernel kernel = clCreateKernel(math_program, "addcol3_kernel", &err);
812 CL_CHECK(err);
813
814 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
815 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
816 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
817 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
818
819 const int nb = ((*n) + 256 - 1) / 256;
820 const size_t global_item_size = 256 * nb;
821 const size_t local_item_size = 256;
822
825 0, NULL, NULL));
826}
827
832void opencl_addcol4(void *a, void *b, void *c, void *d, int *n,
834 cl_int err;
835
836 if (math_program == NULL)
838
839 cl_kernel kernel = clCreateKernel(math_program, "addcol4_kernel", &err);
840 CL_CHECK(err);
841
842 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
843 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
844 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
845 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
846 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
847
848 const int nb = ((*n) + 256 - 1) / 256;
849 const size_t global_item_size = 256 * nb;
850 const size_t local_item_size = 256;
851
854 0, NULL, NULL));
855}
856
862void opencl_vdot3(void *dot, void *u1, void *u2, void *u3,
863 void *v1, void *v2, void *v3, int *n,
865 cl_int err;
866
867 if (math_program == NULL)
869
870 cl_kernel kernel = clCreateKernel(math_program, "vdot3_kernel", &err);
871 CL_CHECK(err);
872
873 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dot));
874 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u1));
875 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u2));
876 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u3));
877 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v1));
878 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v2));
879 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &v3));
880 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
881
882 const int nb = ((*n) + 256 - 1) / 256;
883 const size_t global_item_size = 256 * nb;
884 const size_t local_item_size = 256;
885
888 0, NULL, NULL));
889}
890
896void opencl_vcross(void *u1, void *u2, void *u3,
897 void *v1, void *v2, void *v3,
898 void *w1, void *w2, void *w3,
899 int *n, cl_command_queue cmd_queue) {
900
901 cl_int err;
902
903 if (math_program == NULL)
905
906 cl_kernel kernel = clCreateKernel(math_program, "vcross_kernel", &err);
907 CL_CHECK(err);
908
909 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &u1));
910 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u2));
911 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u3));
912 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &v1));
913 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v2));
914 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v3));
915 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &w1));
916 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &w2));
917 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3));
918 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), n));
919
920 const int nb = ((*n) + 256 - 1) / 256;
921 const size_t global_item_size = 256 * nb;
922 const size_t local_item_size = 256;
923
926 0, NULL, NULL));
927
928}
929
931int red_s = 0;
934
939real opencl_glsc3(void *a, void *b, void *c, int *n,
941 cl_int err;
943 int i;
944
945 if (math_program == NULL)
947
948 const int nb = ((*n) + 256 - 1) / 256;
949 const size_t global_item_size = 256 * nb;
950 const size_t local_item_size = 256;
951
952 if ( nb > red_s){
953 red_s = nb;
954 if (bufred != NULL) {
955 free(bufred);
957 }
958 bufred = (real *) malloc(nb * sizeof(real));
959
961 nb * sizeof(real), NULL, &err);
962 CL_CHECK(err);
963 }
964
965 cl_kernel kernel = clCreateKernel(math_program, "glsc3_kernel", &err);
966 CL_CHECK(err);
967
968 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
969 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
970 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
971 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
972 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
973
976 0, NULL, &kern_wait));
977
979 nb * sizeof(real), bufred, 1,
980 &kern_wait, NULL));
981
982 real res = 0.0;
983 for (i = 0; i < nb; i++) {
984 res += bufred[i];
985 }
986
987 return res;
988}
989
994void opencl_glsc3_many(real *h, void * w, void *v, void *mult, int *j, int *n,
996 int i, k;
997 cl_int err;
999
1000 if (math_program == NULL)
1002
1003 int pow2 = 1;
1004 while(pow2 < (*j)){
1005 pow2 = 2*pow2;
1006 }
1007
1008 const int nt = 256 / pow2;
1009 const int nb = ((*n) + nt - 1) / nt;
1010 const size_t local_item_size[2] = {nt, pow2};
1011 const size_t global_item_size[2] = {nb * nt, pow2};
1012
1013 if((*j)*nb > red_s) {
1014 red_s = (*j)*nb;
1015 if (bufred != NULL) {
1016 free(bufred);
1018 }
1019 bufred = (real *) malloc((*j) * nb * sizeof(real));
1020
1022 (*j) * nb * sizeof(real), NULL, &err);
1023 CL_CHECK(err);
1024 }
1025
1026 cl_kernel kernel = clCreateKernel(math_program, "glsc3_many_kernel", &err);
1027 CL_CHECK(err);
1028
1029 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w));
1030 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &v));
1031 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mult));
1032 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
1033 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), j));
1034 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
1035
1038 0, NULL, &kern_wait));
1039
1041 (*j) * nb * sizeof(real),
1042 bufred, 1, &kern_wait, NULL));
1043
1044 for (k = 0; k < (*j); k++) {
1045 h[k] = 0.0;
1046 }
1047
1048 for (i = 0; i < nb; i++) {
1049 for (k = 0; k < (*j); k++) {
1050 h[k] += bufred[i*(*j)+k];
1051 }
1052 }
1053}
1054
1060 cl_int err;
1062 int i;
1063
1064 if (math_program == NULL)
1066
1067 const int nb = ((*n) + 256 - 1) / 256;
1068 const size_t global_item_size = 256 * nb;
1069 const size_t local_item_size = 256;
1070
1071 real * buf = (real *) malloc(nb * sizeof(real));
1072
1073 cl_kernel kernel = clCreateKernel(math_program, "glsc2_kernel", &err);
1074 CL_CHECK(err);
1075
1077 nb * sizeof(real), NULL, &err);
1078 CL_CHECK(err);
1079
1080 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1081 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1082 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1083 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1084
1087 0, NULL, &kern_wait));
1088
1089
1091 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1092
1093 real res = 0.0;
1094 for (i = 0; i < nb; i++) {
1095 res += buf[i];
1096 }
1097
1098 free(buf);
1100
1101 return res;
1102}
1103
1109 cl_int err;
1111 int i;
1112
1113 if (math_program == NULL)
1115
1116 const int nb = ((*n) + 256 - 1) / 256;
1117 const size_t global_item_size = 256 * nb;
1118 const size_t local_item_size = 256;
1119
1120 real * buf = (real *) malloc(nb * sizeof(real));
1121
1122 cl_kernel kernel = clCreateKernel(math_program, "glsubnorm2_kernel", &err);
1123 CL_CHECK(err);
1124
1126 nb * sizeof(real), NULL, &err);
1127 CL_CHECK(err);
1128
1129 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1130 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1131 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1132 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1133
1136 0, NULL, &kern_wait));
1137
1138
1140 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1141
1142 real res = 0.0;
1143 for (i = 0; i < nb; i++) {
1144 res += buf[i];
1145 }
1146
1147 free(buf);
1149
1150 return res;
1151}
1152
1158 cl_int err;
1160 int i;
1161
1162 if (math_program == NULL)
1164
1165 const int nb = ((*n) + 256 - 1) / 256;
1166 const size_t global_item_size = 256 * nb;
1167 const size_t local_item_size = 256;
1168
1169 real * buf = (real *) malloc(nb * sizeof(real));
1170
1171 cl_kernel kernel = clCreateKernel(math_program, "glsum_kernel", &err);
1172 CL_CHECK(err);
1173
1175 nb * sizeof(real), NULL, &err);
1176 CL_CHECK(err);
1177
1178 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1179 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
1180 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1181
1184 0, NULL, &kern_wait));
1185
1186
1188 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1189
1190 real res = 0.0;
1191 for (i = 0; i < nb; i++) {
1192 res += buf[i];
1193 }
1194
1195 free(buf);
1197
1198 return res;
1199}
1200
1201
1205void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue) {
1206 cl_int err;
1207
1208 if (math_program == NULL)
1210
1211 cl_kernel kernel = clCreateKernel(math_program, "iadd_kernel", &err);
1212 CL_CHECK(err);
1213
1214 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1215 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), c));
1216 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1217
1218 const int nb = ((*n) + 256 - 1) / 256;
1219 const size_t global_item_size = 256 * nb;
1220 const size_t local_item_size = 256;
1221
1224 0, NULL, NULL));
1225}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
const int i
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
void * glb_ctx
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition jit.c:50
void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1205
void opencl_col3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:694
void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:255
void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:121
void opencl_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cl_command_queue cmd_queue)
Definition math.c:896
void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:91
void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:750
void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:668
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:528
void opencl_sub3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:776
void opencl_add2s1(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:469
void opencl_invcol1(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:617
void opencl_add3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:411
real opencl_glsc3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:939
void opencl_rone(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:191
void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:203
void opencl_cfill_mask(void *a, void *c, int *size, void *mask, int *mask_size, cl_command_queue cmd_queue)
Definition math.c:151
void opencl_cadd2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:332
real * bufred
Definition math.c:932
real opencl_glsum(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1157
void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:62
void opencl_add4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:439
void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:307
void opencl_add2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:498
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cl_command_queue cmd_queue)
Definition math.c:862
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cl_command_queue cmd_queue)
Definition math.c:587
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:994
real opencl_glsubnorm2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1108
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:559
int red_s
Definition math.c:931
void opencl_addcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:804
cl_mem bufred_d
Definition math.c:933
void opencl_rzero(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:179
void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:53
void opencl_subcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:722
void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:385
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:832
void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:642
real opencl_glsc2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1059
void opencl_cdiv2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:280
void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:359
void opencl_cmult2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:228
Object for handling masks in Neko.
Definition mask.f90:34
#define CL_CHECK(err)
Definition check.h:12
real * buf
Definition pipecg_aux.cu:42
void * math_program