Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.c
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifdef __APPLE__
36#include <OpenCL/cl.h>
37#else
38#include <CL/cl.h>
39#endif
40
41#include <stdio.h>
42#include <stdlib.h>
44#include <device/opencl/jit.h>
46#include <device/opencl/check.h>
47
48#include "math_kernel.cl.h"
49
53void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue) {
55 b, a, 0, 0, (*n) * sizeof(real),
56 0, NULL, NULL));
57}
58
62void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m,
64 cl_int err;
65
66 if (math_program == NULL)
68
69 cl_kernel kernel = clCreateKernel(math_program, "masked_copy_kernel", &err);
71
72 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
73 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
74 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
75 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
76 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
77
78 const int nb = ((*n) + 256 - 1) / 256;
79 const size_t global_item_size = 256 * nb;
80 const size_t local_item_size = 256;
81
84 0, NULL, NULL));
85
86}
87
91void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m,
93 cl_int err;
94
95 if (math_program == NULL)
97
98 cl_kernel kernel = clCreateKernel(math_program, "masked_gather_copy_kernel",
99 &err);
100 CL_CHECK(err);
101
102 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
103 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
104 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
105 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
106 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
107
108 const int nb = ((*n) + 256 - 1) / 256;
109 const size_t global_item_size = 256 * nb;
110 const size_t local_item_size = 256;
111
114 0, NULL, NULL));
115
116}
117
121void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m,
123 cl_int err;
124
125 if (math_program == NULL)
127
128 cl_kernel kernel = clCreateKernel(math_program, "masked_scatter_copy_kernel",
129 &err);
130 CL_CHECK(err);
131
132 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
133 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
134 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
135 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
136 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), m));
137
138 const int nb = ((*n) + 256 - 1) / 256;
139 const size_t global_item_size = 256 * nb;
140 const size_t local_item_size = 256;
141
144 0, NULL, NULL));
145
146}
147
151void opencl_cfill_mask(void* a, void* c, int* size, void* mask, int* mask_size,
153 cl_int err;
154
155 if (math_program == NULL)
157
158 cl_kernel kernel = clCreateKernel(math_program, "cfill_mask_kernel", &err);
159 CL_CHECK(err);
160
161 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
162 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
163 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), size));
164 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &mask));
165 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), mask_size));
166
167 const int nb = ((*mask_size) + 256 - 1) / 256;
168 const size_t global_item_size = 256 * nb;
169 const size_t local_item_size = 256;
170
173 0, NULL, NULL));
174 }
175
181 real zero = 0.0;
182
184 (*n) * sizeof(real), 0, NULL, &wait_kern));
186}
187
193 real one = 1.0;
194
196 (*n) * sizeof(real), 0, NULL, &wait_kern));
198}
199
203void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue) {
204 cl_int err;
205
206 if (math_program == NULL)
208
209 cl_kernel kernel = clCreateKernel(math_program, "cmult_kernel", &err);
210 CL_CHECK(err);
211
212 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
213 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
214 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
215
216 const int nb = ((*n) + 256 - 1) / 256;
217 const size_t global_item_size = 256 * nb;
218 const size_t local_item_size = 256;
219
222 0, NULL, NULL));
223}
224
228void opencl_cmult2(void *a, void *b, real *c, int *n,
230 cl_int err;
231
232 if (math_program == NULL)
234
235 cl_kernel kernel = clCreateKernel(math_program, "cmult2_kernel", &err);
236 CL_CHECK(err);
237
238 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
239 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
240 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
241 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
242
243 const int nb = ((*n) + 256 - 1) / 256;
244 const size_t global_item_size = 256 * nb;
245 const size_t local_item_size = 256;
246
249 0, NULL, NULL));
250}
251
255void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue) {
256 cl_int err;
257
258 if (math_program == NULL)
260
261 cl_kernel kernel = clCreateKernel(math_program, "cdiv_kernel", &err);
262 CL_CHECK(err);
263
264 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
265 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
266 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
267
268 const int nb = ((*n) + 256 - 1) / 256;
269 const size_t global_item_size = 256 * nb;
270 const size_t local_item_size = 256;
271
274 0, NULL, NULL));
275}
276
280void opencl_cdiv2(void *a, void *b, real *c, int *n,
282 cl_int err;
283
284 if (math_program == NULL)
286
287 cl_kernel kernel = clCreateKernel(math_program, "cdiv2_kernel", &err);
288 CL_CHECK(err);
289
290 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
291 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
292 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
293 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
294
295 const int nb = ((*n) + 256 - 1) / 256;
296 const size_t global_item_size = 256 * nb;
297 const size_t local_item_size = 256;
298
301 0, NULL, NULL));
302}
303
307void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue) {
308 cl_int err;
309
310 if (math_program == NULL)
312
313 cl_kernel kernel = clCreateKernel(math_program, "radd_kernel", &err);
314 CL_CHECK(err);
315
316 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
317 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
318 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
319
320 const int nb = ((*n) + 256 - 1) / 256;
321 const size_t global_item_size = 256 * nb;
322 const size_t local_item_size = 256;
323
326 0, NULL, NULL));
327}
328
332void opencl_cadd2(void *a, void *b, real *c, int *n,
334 cl_int err;
335
336 if (math_program == NULL)
338
339 cl_kernel kernel = clCreateKernel(math_program, "cadd2_kernel", &err);
340 CL_CHECK(err);
341
342 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
343 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
344 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
345 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
346
347 const int nb = ((*n) + 256 - 1) / 256;
348 const size_t global_item_size = 256 * nb;
349 const size_t local_item_size = 256;
350
353 0, NULL, NULL));
354}
355
359void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue) {
360 cl_int err;
361
362 if (math_program == NULL)
364
365 cl_kernel kernel = clCreateKernel(math_program, "cfill_kernel", &err);
366 CL_CHECK(err);
367
368 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
369 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
370 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
371
372 const int nb = ((*n) + 256 - 1) / 256;
373 const size_t global_item_size = 256 * nb;
374 const size_t local_item_size = 256;
375
378 0, NULL, NULL));
379}
380
385void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
386 cl_int err;
387
388 if (math_program == NULL)
390
391 cl_kernel kernel = clCreateKernel(math_program, "add2_kernel", &err);
392 CL_CHECK(err);
393
394 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
395 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
396 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
397
398 const int nb = ((*n) + 256 - 1) / 256;
399 const size_t global_item_size = 256 * nb;
400 const size_t local_item_size = 256;
401
404 0, NULL, NULL));
405}
406
411void opencl_add3(void *a, void *b, void *c, int *n,
413 cl_int err;
414
415 if (math_program == NULL)
417
418 cl_kernel kernel = clCreateKernel(math_program, "add3_kernel", &err);
419 CL_CHECK(err);
420
421 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
422 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
423 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
424 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
425
426 const int nb = ((*n) + 256 - 1) / 256;
427 const size_t global_item_size = 256 * nb;
428 const size_t local_item_size = 256;
429
432 0, NULL, NULL));
433}
434
439void opencl_add4(void *a, void *b, void *c, void *d, int *n,
441 cl_int err;
442
443 if (math_program == NULL)
445
446 cl_kernel kernel = clCreateKernel(math_program, "add4_kernel", &err);
447 CL_CHECK(err);
448
449 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
450 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
451 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
452 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
453 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
454
455 const int nb = ((*n) + 256 - 1) / 256;
456 const size_t global_item_size = 256 * nb;
457 const size_t local_item_size = 256;
458
461 0, NULL, NULL));
462}
463
469void opencl_add2s1(void *a, void *b, real *c1, int *n,
471 cl_int err;
472
473 if (math_program == NULL)
475
476 cl_kernel kernel = clCreateKernel(math_program, "add2s1_kernel", &err);
477 CL_CHECK(err);
478
479 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
480 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
481 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
482 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
483
484 const int nb = ((*n) + 256 - 1) / 256;
485 const size_t global_item_size = 256 * nb;
486 const size_t local_item_size = 256;
487
490 0, NULL, NULL));
491}
492
498void opencl_add2s2(void *a, void *b, real *c1, int *n,
500 cl_int err;
501
502 if (math_program == NULL)
504
505 cl_kernel kernel = clCreateKernel(math_program, "add2s2_kernel", &err);
506 CL_CHECK(err);
507
508 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
509 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
510 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
511 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
512
513 const int nb = ((*n) + 256 - 1) / 256;
514 const size_t global_item_size = 256 * nb;
515 const size_t local_item_size = 256;
516
519 0, NULL, NULL));
520}
521
528void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n,
530 cl_int err;
531
532 if (math_program == NULL)
534
535 cl_kernel kernel = clCreateKernel(math_program, "add2s2_many_kernel", &err);
536 CL_CHECK(err);
537
538 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &x));
539 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &p));
540 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &alpha));
541 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), j));
542 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
543
544 const int nb = ((*n) + 256 - 1) / 256;
545 const size_t global_item_size = 256 * nb;
546 const size_t local_item_size = 256;
547
550 0, NULL, NULL));
551
552}
553
559void opencl_addsqr2s2(void *a, void *b, real *c1, int *n,
561 cl_int err;
562
563 if (math_program == NULL)
565
566 cl_kernel kernel = clCreateKernel(math_program, "addsqr2s2_kernel", &err);
567 CL_CHECK(err);
568
569 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
570 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
571 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
572 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
573
574 const int nb = ((*n) + 256 - 1) / 256;
575 const size_t global_item_size = 256 * nb;
576 const size_t local_item_size = 256;
577
580 0, NULL, NULL));
581}
582
587void opencl_add3s2(void *a, void *b, void * c, real *c1, real *c2, int *n,
589 cl_int err;
590
591 if (math_program == NULL)
593
594 cl_kernel kernel = clCreateKernel(math_program, "add3s2_kernel", &err);
595 CL_CHECK(err);
596
597 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
598 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
599 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
600 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), c1));
601 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c2));
602 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
603
604 const int nb = ((*n) + 256 - 1) / 256;
605 const size_t global_item_size = 256 * nb;
606 const size_t local_item_size = 256;
607
610 0, NULL, NULL));
611}
612
617void opencl_add4s3(void *a, void *b, void * c, void * d, real *c1, real *c2,
618 real *c3, int *n, cl_command_queue cmd_queue) {
619 cl_int err;
620
621 if (math_program == NULL)
623
624 cl_kernel kernel = clCreateKernel(math_program, "add4s3_kernel", &err);
625 CL_CHECK(err);
626
627 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
628 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
629 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
630 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
631 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c1));
632 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(real), c2));
633 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(real), c3));
634 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
635
636 const int nb = ((*n) + 256 - 1) / 256;
637 const size_t global_item_size = 256 * nb;
638 const size_t local_item_size = 256;
639
642 0, NULL, NULL));
643}
644
649void opencl_add5s4(void *a, void *b, void * c, void * d, void * e, real *c1,
650 real *c2, real *c3, real * c4, int *n,
652 cl_int err;
653
654 if (math_program == NULL)
656
657 cl_kernel kernel = clCreateKernel(math_program, "add5s4_kernel", &err);
658 CL_CHECK(err);
659
660 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
661 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
662 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
663 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
664 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &e));
665 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(real), c1));
666 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(real), c2));
667 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(real), c3));
668 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(real), c4));
669 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), n));
670
671 const int nb = ((*n) + 256 - 1) / 256;
672 const size_t global_item_size = 256 * nb;
673 const size_t local_item_size = 256;
674
677 0, NULL, NULL));
678}
679
685 cl_int err;
686
687 if (math_program == NULL)
689
690 cl_kernel kernel = clCreateKernel(math_program, "invcol1_kernel", &err);
691 CL_CHECK(err);
692
693 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
694 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
695
696 const int nb = ((*n) + 256 - 1) / 256;
697 const size_t global_item_size = 256 * nb;
698 const size_t local_item_size = 256;
699
702 0, NULL, NULL));
703}
704
709void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
710 cl_int err;
711
712 if (math_program == NULL)
714
715 cl_kernel kernel = clCreateKernel(math_program, "invcol2_kernel", &err);
716 CL_CHECK(err);
717
718 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
719 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
720 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
721
722 const int nb = ((*n) + 256 - 1) / 256;
723 const size_t global_item_size = 256 * nb;
724 const size_t local_item_size = 256;
725
728 0, NULL, NULL));
729}
730
735void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
736 cl_int err;
737
738 if (math_program == NULL)
740
741 cl_kernel kernel = clCreateKernel(math_program, "col2_kernel", &err);
742 CL_CHECK(err);
743
744 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
745 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
746 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
747
748 const int nb = ((*n) + 256 - 1) / 256;
749 const size_t global_item_size = 256 * nb;
750 const size_t local_item_size = 256;
751
754 0, NULL, NULL));
755}
756
761void opencl_col3(void *a, void *b, void *c, int *n,
763 cl_int err;
764
765 if (math_program == NULL)
767
768 cl_kernel kernel = clCreateKernel(math_program, "col3_kernel", &err);
769 CL_CHECK(err);
770
771 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
772 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
773 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
774 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
775
776 const int nb = ((*n) + 256 - 1) / 256;
777 const size_t global_item_size = 256 * nb;
778 const size_t local_item_size = 256;
779
782 0, NULL, NULL));
783}
784
789void opencl_subcol3(void *a, void *b, void *c, int *n,
791 cl_int err;
792
793 if (math_program == NULL)
795
796 cl_kernel kernel = clCreateKernel(math_program, "subcol3_kernel", &err);
797 CL_CHECK(err);
798
799 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
800 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
801 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
802 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
803
804 const int nb = ((*n) + 256 - 1) / 256;
805 const size_t global_item_size = 256 * nb;
806 const size_t local_item_size = 256;
807
810 0, NULL, NULL));
811}
812
817void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue) {
818 cl_int err;
819
820 if (math_program == NULL)
822
823 cl_kernel kernel = clCreateKernel(math_program, "sub2_kernel", &err);
824 CL_CHECK(err);
825
826 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
827 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
828 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
829
830 const int nb = ((*n) + 256 - 1) / 256;
831 const size_t global_item_size = 256 * nb;
832 const size_t local_item_size = 256;
833
836 0, NULL, NULL));
837}
838
843void opencl_sub3(void *a, void *b, void *c, int *n,
845 cl_int err;
846
847 if (math_program == NULL)
849
850 cl_kernel kernel = clCreateKernel(math_program, "sub3_kernel", &err);
851 CL_CHECK(err);
852
853 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
854 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
855 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
856 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
857
858 const int nb = ((*n) + 256 - 1) / 256;
859 const size_t global_item_size = 256 * nb;
860 const size_t local_item_size = 256;
861
864 0, NULL, NULL));
865}
866
871void opencl_addcol3(void *a, void *b, void *c, int *n,
873 cl_int err;
874
875 if (math_program == NULL)
877
878 cl_kernel kernel = clCreateKernel(math_program, "addcol3_kernel", &err);
879 CL_CHECK(err);
880
881 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
882 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
883 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
884 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
885
886 const int nb = ((*n) + 256 - 1) / 256;
887 const size_t global_item_size = 256 * nb;
888 const size_t local_item_size = 256;
889
892 0, NULL, NULL));
893}
894
899void opencl_addcol4(void *a, void *b, void *c, void *d, int *n,
901 cl_int err;
902
903 if (math_program == NULL)
905
906 cl_kernel kernel = clCreateKernel(math_program, "addcol4_kernel", &err);
907 CL_CHECK(err);
908
909 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
910 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
911 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
912 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
913 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
914
915 const int nb = ((*n) + 256 - 1) / 256;
916 const size_t global_item_size = 256 * nb;
917 const size_t local_item_size = 256;
918
921 0, NULL, NULL));
922}
923
928void opencl_addcol3s2(void *a, void *b, void *c, real *s, int *n,
930 cl_int err;
931
932 if (math_program == NULL)
934
935 cl_kernel kernel = clCreateKernel(math_program, "addcol3s2_kernel", &err);
936 CL_CHECK(err);
937
938 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
939 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
940 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
941 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), s));
942 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
943
944 const int nb = ((*n) + 256 - 1) / 256;
945 const size_t global_item_size = 256 * nb;
946 const size_t local_item_size = 256;
947
950 0, NULL, NULL));
951}
952
958void opencl_vdot3(void *dot, void *u1, void *u2, void *u3,
959 void *v1, void *v2, void *v3, int *n,
961 cl_int err;
962
963 if (math_program == NULL)
965
966 cl_kernel kernel = clCreateKernel(math_program, "vdot3_kernel", &err);
967 CL_CHECK(err);
968
969 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dot));
970 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u1));
971 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u2));
972 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u3));
973 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v1));
974 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v2));
975 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &v3));
976 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
977
978 const int nb = ((*n) + 256 - 1) / 256;
979 const size_t global_item_size = 256 * nb;
980 const size_t local_item_size = 256;
981
984 0, NULL, NULL));
985}
986
992void opencl_vcross(void *u1, void *u2, void *u3,
993 void *v1, void *v2, void *v3,
994 void *w1, void *w2, void *w3,
995 int *n, cl_command_queue cmd_queue) {
996
997 cl_int err;
998
999 if (math_program == NULL)
1001
1002 cl_kernel kernel = clCreateKernel(math_program, "vcross_kernel", &err);
1003 CL_CHECK(err);
1004
1005 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &u1));
1006 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u2));
1007 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u3));
1008 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &v1));
1009 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v2));
1010 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v3));
1011 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &w1));
1012 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &w2));
1013 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &w3));
1014 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), n));
1015
1016 const int nb = ((*n) + 256 - 1) / 256;
1017 const size_t global_item_size = 256 * nb;
1018 const size_t local_item_size = 256;
1019
1022 0, NULL, NULL));
1023
1024}
1025
1027int red_s = 0;
1030
1035real opencl_glsc3(void *a, void *b, void *c, int *n,
1037 cl_int err;
1039 int i;
1040
1041 if (math_program == NULL)
1043
1044 const int nb = ((*n) + 256 - 1) / 256;
1045 const size_t global_item_size = 256 * nb;
1046 const size_t local_item_size = 256;
1047
1048 if ( nb > red_s){
1049 red_s = nb;
1050 if (bufred != NULL) {
1051 free(bufred);
1053 }
1054 bufred = (real *) malloc(nb * sizeof(real));
1055
1057 nb * sizeof(real), NULL, &err);
1058 CL_CHECK(err);
1059 }
1060
1061 cl_kernel kernel = clCreateKernel(math_program, "glsc3_kernel", &err);
1062 CL_CHECK(err);
1063
1064 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1065 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1066 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
1067 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
1068 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
1069
1072 0, NULL, &kern_wait));
1073
1075 nb * sizeof(real), bufred, 1,
1076 &kern_wait, NULL));
1077
1078 real res = 0.0;
1079 for (i = 0; i < nb; i++) {
1080 res += bufred[i];
1081 }
1082
1083 return res;
1084}
1085
1090void opencl_glsc3_many(real *h, void * w, void *v, void *mult, int *j, int *n,
1092 int i, k;
1093 cl_int err;
1095
1096 if (math_program == NULL)
1098
1099 int pow2 = 1;
1100 while(pow2 < (*j)){
1101 pow2 = 2*pow2;
1102 }
1103
1104 const int nt = 256 / pow2;
1105 const int nb = ((*n) + nt - 1) / nt;
1106 const size_t local_item_size[2] = {nt, pow2};
1107 const size_t global_item_size[2] = {nb * nt, pow2};
1108
1109 if((*j)*nb > red_s) {
1110 red_s = (*j)*nb;
1111 if (bufred != NULL) {
1112 free(bufred);
1114 }
1115 bufred = (real *) malloc((*j) * nb * sizeof(real));
1116
1118 (*j) * nb * sizeof(real), NULL, &err);
1119 CL_CHECK(err);
1120 }
1121
1122 cl_kernel kernel = clCreateKernel(math_program, "glsc3_many_kernel", &err);
1123 CL_CHECK(err);
1124
1125 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w));
1126 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &v));
1127 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mult));
1128 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
1129 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), j));
1130 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
1131
1134 0, NULL, &kern_wait));
1135
1137 (*j) * nb * sizeof(real),
1138 bufred, 1, &kern_wait, NULL));
1139
1140 for (k = 0; k < (*j); k++) {
1141 h[k] = 0.0;
1142 }
1143
1144 for (i = 0; i < nb; i++) {
1145 for (k = 0; k < (*j); k++) {
1146 h[k] += bufred[i*(*j)+k];
1147 }
1148 }
1149}
1150
1156 cl_int err;
1158 int i;
1159
1160 if (math_program == NULL)
1162
1163 const int nb = ((*n) + 256 - 1) / 256;
1164 const size_t global_item_size = 256 * nb;
1165 const size_t local_item_size = 256;
1166
1167 real * buf = (real *) malloc(nb * sizeof(real));
1168
1169 cl_kernel kernel = clCreateKernel(math_program, "glsc2_kernel", &err);
1170 CL_CHECK(err);
1171
1173 nb * sizeof(real), NULL, &err);
1174 CL_CHECK(err);
1175
1176 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1177 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1178 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1179 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1180
1183 0, NULL, &kern_wait));
1184
1185
1187 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1188
1189 real res = 0.0;
1190 for (i = 0; i < nb; i++) {
1191 res += buf[i];
1192 }
1193
1194 free(buf);
1196
1197 return res;
1198}
1199
1205 cl_int err;
1207 int i;
1208
1209 if (math_program == NULL)
1211
1212 const int nb = ((*n) + 256 - 1) / 256;
1213 const size_t global_item_size = 256 * nb;
1214 const size_t local_item_size = 256;
1215
1216 real * buf = (real *) malloc(nb * sizeof(real));
1217
1218 cl_kernel kernel = clCreateKernel(math_program, "glsubnorm2_kernel", &err);
1219 CL_CHECK(err);
1220
1222 nb * sizeof(real), NULL, &err);
1223 CL_CHECK(err);
1224
1225 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1226 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
1227 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
1228 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
1229
1232 0, NULL, &kern_wait));
1233
1234
1236 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1237
1238 real res = 0.0;
1239 for (i = 0; i < nb; i++) {
1240 res += buf[i];
1241 }
1242
1243 free(buf);
1245
1246 return res;
1247}
1248
1254 cl_int err;
1256 int i;
1257
1258 if (math_program == NULL)
1260
1261 const int nb = ((*n) + 256 - 1) / 256;
1262 const size_t global_item_size = 256 * nb;
1263 const size_t local_item_size = 256;
1264
1265 real * buf = (real *) malloc(nb * sizeof(real));
1266
1267 cl_kernel kernel = clCreateKernel(math_program, "glsum_kernel", &err);
1268 CL_CHECK(err);
1269
1271 nb * sizeof(real), NULL, &err);
1272 CL_CHECK(err);
1273
1274 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1275 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
1276 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1277
1280 0, NULL, &kern_wait));
1281
1282
1284 nb * sizeof(real), buf, 1, &kern_wait, NULL));
1285
1286 real res = 0.0;
1287 for (i = 0; i < nb; i++) {
1288 res += buf[i];
1289 }
1290
1291 free(buf);
1293
1294 return res;
1295}
1296
1297
1301void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue) {
1302 cl_int err;
1303
1304 if (math_program == NULL)
1306
1307 cl_kernel kernel = clCreateKernel(math_program, "iadd_kernel", &err);
1308 CL_CHECK(err);
1309
1310 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
1311 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), c));
1312 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
1313
1314 const int nb = ((*n) + 256 - 1) / 256;
1315 const size_t global_item_size = 256 * nb;
1316 const size_t local_item_size = 256;
1317
1320 0, NULL, NULL));
1321}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
const int i
const int e
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
void * glb_ctx
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition jit.c:50
void opencl_iadd(void *a, int *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1301
void opencl_col3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:761
void opencl_cdiv(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:255
void opencl_masked_scatter_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:121
void opencl_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n, cl_command_queue cmd_queue)
Definition math.c:992
void opencl_masked_gather_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:91
void opencl_sub2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:817
void opencl_col2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:735
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:528
void opencl_sub3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:843
void opencl_add2s1(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:469
void opencl_addcol3s2(void *a, void *b, void *c, real *s, int *n, cl_command_queue cmd_queue)
Definition math.c:928
void opencl_invcol1(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:684
void opencl_add3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:411
real opencl_glsc3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:1035
void opencl_rone(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:191
void opencl_add5s4(void *a, void *b, void *c, void *d, void *e, real *c1, real *c2, real *c3, real *c4, int *n, cl_command_queue cmd_queue)
Definition math.c:649
void opencl_add4s3(void *a, void *b, void *c, void *d, real *c1, real *c2, real *c3, int *n, cl_command_queue cmd_queue)
Definition math.c:617
void opencl_cmult(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:203
void opencl_cfill_mask(void *a, void *c, int *size, void *mask, int *mask_size, cl_command_queue cmd_queue)
Definition math.c:151
void opencl_cadd2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:332
real * bufred
Definition math.c:1028
real opencl_glsum(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:1253
void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m, cl_command_queue cmd_queue)
Definition math.c:62
void opencl_add4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:439
void opencl_radd(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:307
void opencl_add2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:498
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n, cl_command_queue cmd_queue)
Definition math.c:958
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n, cl_command_queue cmd_queue)
Definition math.c:587
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n, cl_command_queue cmd_queue)
Definition math.c:1090
real opencl_glsubnorm2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1204
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n, cl_command_queue cmd_queue)
Definition math.c:559
int red_s
Definition math.c:1027
void opencl_addcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:871
cl_mem bufred_d
Definition math.c:1029
void opencl_rzero(void *a, int *n, cl_command_queue cmd_queue)
Definition math.c:179
void opencl_copy(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:53
void opencl_subcol3(void *a, void *b, void *c, int *n, cl_command_queue cmd_queue)
Definition math.c:789
void opencl_add2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:385
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n, cl_command_queue cmd_queue)
Definition math.c:899
void opencl_invcol2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:709
real opencl_glsc2(void *a, void *b, int *n, cl_command_queue cmd_queue)
Definition math.c:1155
void opencl_cdiv2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:280
void opencl_cfill(void *a, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:359
void opencl_cmult2(void *a, void *b, real *c, int *n, cl_command_queue cmd_queue)
Definition math.c:228
Object for handling masks in Neko.
Definition mask.f90:34
#define CL_CHECK(err)
Definition check.h:12
real * buf
Definition pipecg_aux.cu:42
void * math_program