Neko 0.9.99
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.c
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2024, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35#ifdef __APPLE__
36#include <OpenCL/cl.h>
37#else
38#include <CL/cl.h>
39#endif
40
41#include <stdio.h>
42#include <stdlib.h>
44#include <device/opencl/jit.h>
46#include <device/opencl/check.h>
47
48#include "math_kernel.cl.h"
49
53void opencl_copy(void *a, void *b, int *n) {
55 b, a, 0, 0, (*n) * sizeof(real),
56 0, NULL, NULL));
57}
58
62void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m) {
63 cl_int err;
64
65 if (math_program == NULL)
67
68 cl_kernel kernel = clCreateKernel(math_program, "masked_copy_kernel", &err);
70
71 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
72 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
73 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
74 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
75 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), m));
76
77 const int nb = ((*n) + 256 - 1) / 256;
78 const size_t global_item_size = 256 * nb;
79 const size_t local_item_size = 256;
80
83 0, NULL, NULL));
84
85}
86
90void opencl_cfill_mask(void* a, void* c, int* size, void* mask, int* mask_size) {
91 cl_int err;
92
93 if (math_program == NULL)
95
96 cl_kernel kernel = clCreateKernel(math_program, "cfill_mask_kernel", &err);
98
99 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
100 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
101 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), size));
102 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &mask));
103 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), mask_size));
104
105 const int nb = ((*mask_size) + 256 - 1) / 256;
106 const size_t global_item_size = 256 * nb;
107 const size_t local_item_size = 256;
108
111 0, NULL, NULL));
112 }
113
117void opencl_rzero(void *a, int *n) {
119 real zero = 0.0;
120
122 a, &zero, sizeof(real), 0,
123 (*n) * sizeof(real), 0, NULL, &wait_kern));
125}
126
130void opencl_rone(void *a, int *n) {
132 real one = 1.0;
133
135 a, &one, sizeof(real), 0,
136 (*n) * sizeof(real), 0, NULL, &wait_kern));
138}
139
143void opencl_cmult2(void *a, void *b, real *c, int *n) {
144 cl_int err;
145
146 if (math_program == NULL)
148
149 cl_kernel kernel = clCreateKernel(math_program, "cmult2_kernel", &err);
150 CL_CHECK(err);
151
152 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
153 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
154 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
155 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
156
157 const int nb = ((*n) + 256 - 1) / 256;
158 const size_t global_item_size = 256 * nb;
159 const size_t local_item_size = 256;
160
163 0, NULL, NULL));
164}
165
166
170void opencl_cmult(void *a, real *c, int *n) {
171 cl_int err;
172
173 if (math_program == NULL)
175
176 cl_kernel kernel = clCreateKernel(math_program, "cmult_kernel", &err);
177 CL_CHECK(err);
178
179 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
180 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
181 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
182
183 const int nb = ((*n) + 256 - 1) / 256;
184 const size_t global_item_size = 256 * nb;
185 const size_t local_item_size = 256;
186
189 0, NULL, NULL));
190}
191
195void opencl_cadd(void *a, real *c, int *n) {
196 cl_int err;
197
198 if (math_program == NULL)
200
201 cl_kernel kernel = clCreateKernel(math_program, "cadd_kernel", &err);
202 CL_CHECK(err);
203
204 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
205 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
206 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
207
208 const int nb = ((*n) + 256 - 1) / 256;
209 const size_t global_item_size = 256 * nb;
210 const size_t local_item_size = 256;
211
214 0, NULL, NULL));
215}
216
220void opencl_cadd2(void *a, void *b, real *c, int *n) {
221 cl_int err;
222
223 if (math_program == NULL)
225
226 cl_kernel kernel = clCreateKernel(math_program, "cadd2_kernel", &err);
227 CL_CHECK(err);
228
229 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
230 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
231 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
232 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
233
234 const int nb = ((*n) + 256 - 1) / 256;
235 const size_t global_item_size = 256 * nb;
236 const size_t local_item_size = 256;
237
240 0, NULL, NULL));
241}
242
246void opencl_cfill(void *a, real *c, int *n) {
247 cl_int err;
248
249 if (math_program == NULL)
251
252 cl_kernel kernel = clCreateKernel(math_program, "cfill_kernel", &err);
253 CL_CHECK(err);
254
255 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
256 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
257 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
258
259 const int nb = ((*n) + 256 - 1) / 256;
260 const size_t global_item_size = 256 * nb;
261 const size_t local_item_size = 256;
262
265 0, NULL, NULL));
266}
267
272void opencl_add2(void *a, void *b, int *n) {
273 cl_int err;
274
275 if (math_program == NULL)
277
278 cl_kernel kernel = clCreateKernel(math_program, "add2_kernel", &err);
279 CL_CHECK(err);
280
281 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
282 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
283 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
284
285 const int nb = ((*n) + 256 - 1) / 256;
286 const size_t global_item_size = 256 * nb;
287 const size_t local_item_size = 256;
288
291 0, NULL, NULL));
292}
293
298void opencl_add3(void *a, void *b, void *c, int *n) {
299 cl_int err;
300
301 if (math_program == NULL)
303
304 cl_kernel kernel = clCreateKernel(math_program, "add3_kernel", &err);
305 CL_CHECK(err);
306
307 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
308 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
309 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
310 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
311
312 const int nb = ((*n) + 256 - 1) / 256;
313 const size_t global_item_size = 256 * nb;
314 const size_t local_item_size = 256;
315
318 0, NULL, NULL));
319}
320
325void opencl_add4(void *a, void *b, void *c, void *d, int *n) {
326 cl_int err;
327
328 if (math_program == NULL)
330
331 cl_kernel kernel = clCreateKernel(math_program, "add4_kernel", &err);
332 CL_CHECK(err);
333
334 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
335 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
336 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
337 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
338 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
339
340 const int nb = ((*n) + 256 - 1) / 256;
341 const size_t global_item_size = 256 * nb;
342 const size_t local_item_size = 256;
343
346 0, NULL, NULL));
347}
348
354void opencl_add2s1(void *a, void *b, real *c1, int *n) {
355 cl_int err;
356
357 if (math_program == NULL)
359
360 cl_kernel kernel = clCreateKernel(math_program, "add2s1_kernel", &err);
361 CL_CHECK(err);
362
363 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
364 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
365 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
366 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
367
368 const int nb = ((*n) + 256 - 1) / 256;
369 const size_t global_item_size = 256 * nb;
370 const size_t local_item_size = 256;
371
374 0, NULL, NULL));
375}
376
382void opencl_add2s2(void *a, void *b, real *c1, int *n) {
383 cl_int err;
384
385 if (math_program == NULL)
387
388 cl_kernel kernel = clCreateKernel(math_program, "add2s2_kernel", &err);
389 CL_CHECK(err);
390
391 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
392 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
393 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
394 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
395
396 const int nb = ((*n) + 256 - 1) / 256;
397 const size_t global_item_size = 256 * nb;
398 const size_t local_item_size = 256;
399
402 0, NULL, NULL));
403}
404
411void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n) {
412 cl_int err;
413
414 if (math_program == NULL)
416
417 cl_kernel kernel = clCreateKernel(math_program, "add2s2_many_kernel", &err);
418 CL_CHECK(err);
419
420 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &x));
421 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &p));
422 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &alpha));
423 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), j));
424 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
425
426 const int nb = ((*n) + 256 - 1) / 256;
427 const size_t global_item_size = 256 * nb;
428 const size_t local_item_size = 256;
429
432 0, NULL, NULL));
433
434}
435
441void opencl_addsqr2s2(void *a, void *b, real *c1, int *n) {
442 cl_int err;
443
444 if (math_program == NULL)
446
447 cl_kernel kernel = clCreateKernel(math_program, "addsqr2s2_kernel", &err);
448 CL_CHECK(err);
449
450 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
451 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
452 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
453 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
454
455 const int nb = ((*n) + 256 - 1) / 256;
456 const size_t global_item_size = 256 * nb;
457 const size_t local_item_size = 256;
458
461 0, NULL, NULL));
462}
463
468void opencl_add3s2(void *a, void *b, void * c, real *c1, real *c2, int *n) {
469 cl_int err;
470
471 if (math_program == NULL)
473
474 cl_kernel kernel = clCreateKernel(math_program, "add3s2_kernel", &err);
475 CL_CHECK(err);
476
477 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
478 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
479 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
480 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), c1));
481 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c2));
482 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
483
484 const int nb = ((*n) + 256 - 1) / 256;
485 const size_t global_item_size = 256 * nb;
486 const size_t local_item_size = 256;
487
490 0, NULL, NULL));
491}
492
497void opencl_invcol1(void *a, int *n) {
498 cl_int err;
499
500 if (math_program == NULL)
502
503 cl_kernel kernel = clCreateKernel(math_program, "invcol1_kernel", &err);
504
505 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
506 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
507
508 const int nb = ((*n) + 256 - 1) / 256;
509 const size_t global_item_size = 256 * nb;
510 const size_t local_item_size = 256;
511
514 0, NULL, NULL));
515}
516
521void opencl_invcol2(void *a, void *b, int *n) {
522 cl_int err;
523
524 if (math_program == NULL)
526
527 cl_kernel kernel = clCreateKernel(math_program, "invcol2_kernel", &err);
528 CL_CHECK(err);
529
530 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
531 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
532 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
533
534 const int nb = ((*n) + 256 - 1) / 256;
535 const size_t global_item_size = 256 * nb;
536 const size_t local_item_size = 256;
537
540 0, NULL, NULL));
541}
542
547void opencl_col2(void *a, void *b, int *n) {
548 cl_int err;
549
550 if (math_program == NULL)
552
553 cl_kernel kernel = clCreateKernel(math_program, "col2_kernel", &err);
554 CL_CHECK(err);
555
556 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
557 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
558 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
559
560 const int nb = ((*n) + 256 - 1) / 256;
561 const size_t global_item_size = 256 * nb;
562 const size_t local_item_size = 256;
563
566 0, NULL, NULL));
567}
568
573void opencl_col3(void *a, void *b, void *c, int *n) {
574 cl_int err;
575
576 if (math_program == NULL)
578
579 cl_kernel kernel = clCreateKernel(math_program, "col3_kernel", &err);
580 CL_CHECK(err);
581
582 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
583 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
584 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
585 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
586
587 const int nb = ((*n) + 256 - 1) / 256;
588 const size_t global_item_size = 256 * nb;
589 const size_t local_item_size = 256;
590
593 0, NULL, NULL));
594}
595
600void opencl_subcol3(void *a, void *b, void *c, int *n) {
601 cl_int err;
602
603 if (math_program == NULL)
605
606 cl_kernel kernel = clCreateKernel(math_program, "subcol3_kernel", &err);
607 CL_CHECK(err);
608
609 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
610 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
611 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
612 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
613
614 const int nb = ((*n) + 256 - 1) / 256;
615 const size_t global_item_size = 256 * nb;
616 const size_t local_item_size = 256;
617
620 0, NULL, NULL));
621}
622
627void opencl_sub2(void *a, void *b, int *n) {
628 cl_int err;
629
630 if (math_program == NULL)
632
633 cl_kernel kernel = clCreateKernel(math_program, "sub2_kernel", &err);
634 CL_CHECK(err);
635
636 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
637 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
638 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
639
640 const int nb = ((*n) + 256 - 1) / 256;
641 const size_t global_item_size = 256 * nb;
642 const size_t local_item_size = 256;
643
646 0, NULL, NULL));
647}
648
653void opencl_sub3(void *a, void *b, void *c, int *n) {
654 cl_int err;
655
656 if (math_program == NULL)
658
659 cl_kernel kernel = clCreateKernel(math_program, "sub3_kernel", &err);
660 CL_CHECK(err);
661
662 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
663 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
664 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
665 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
666
667 const int nb = ((*n) + 256 - 1) / 256;
668 const size_t global_item_size = 256 * nb;
669 const size_t local_item_size = 256;
670
673 0, NULL, NULL));
674}
675
680void opencl_addcol3(void *a, void *b, void *c, int *n) {
681 cl_int err;
682
683 if (math_program == NULL)
685
686 cl_kernel kernel = clCreateKernel(math_program, "addcol3_kernel", &err);
687 CL_CHECK(err);
688
689 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
690 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
691 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
692 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
693
694 const int nb = ((*n) + 256 - 1) / 256;
695 const size_t global_item_size = 256 * nb;
696 const size_t local_item_size = 256;
697
700 0, NULL, NULL));
701}
702
707void opencl_addcol4(void *a, void *b, void *c, void *d, int *n) {
708 cl_int err;
709
710 if (math_program == NULL)
712
713 cl_kernel kernel = clCreateKernel(math_program, "addcol4_kernel", &err);
714 CL_CHECK(err);
715
716 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
717 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
718 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
719 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
720 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
721
722 const int nb = ((*n) + 256 - 1) / 256;
723 const size_t global_item_size = 256 * nb;
724 const size_t local_item_size = 256;
725
728 0, NULL, NULL));
729}
730
736void opencl_vdot3(void *dot, void *u1, void *u2, void *u3,
737 void *v1, void *v2, void *v3, int *n) {
738 cl_int err;
739
740 if (math_program == NULL)
742
743 cl_kernel kernel = clCreateKernel(math_program, "vdot3_kernel", &err);
744 CL_CHECK(err);
745
746 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dot));
747 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u1));
748 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u2));
749 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u3));
750 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v1));
751 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v2));
752 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &v3));
753 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
754
755 const int nb = ((*n) + 256 - 1) / 256;
756 const size_t global_item_size = 256 * nb;
757 const size_t local_item_size = 256;
758
761 0, NULL, NULL));
762}
763
765int red_s = 0;
768
773real opencl_glsc3(void *a, void *b, void *c, int *n) {
774 cl_int err;
776 int i;
777
778 if (math_program == NULL)
780
781 const int nb = ((*n) + 256 - 1) / 256;
782 const size_t global_item_size = 256 * nb;
783 const size_t local_item_size = 256;
784
785 if ( nb > red_s){
786 red_s = nb;
787 if (bufred != NULL) {
788 free(bufred);
790 }
791 bufred = (real *) malloc(nb * sizeof(real));
792
794 nb * sizeof(real), NULL, &err);
795 CL_CHECK(err);
796 }
797
798 cl_kernel kernel = clCreateKernel(math_program, "glsc3_kernel", &err);
799 CL_CHECK(err);
800
801 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
802 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
803 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
804 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
805 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
806
809 0, NULL, &kern_wait));
810
812 CL_TRUE, 0, nb * sizeof(real), bufred, 1,
813 &kern_wait, NULL));
814
815 real res = 0.0;
816 for (i = 0; i < nb; i++) {
817 res += bufred[i];
818 }
819
820 return res;
821}
822
827void opencl_glsc3_many(real *h, void * w, void *v, void *mult, int *j, int *n){
828 int i, k;
829 cl_int err;
831
832 if (math_program == NULL)
834
835 int pow2 = 1;
836 while(pow2 < (*j)){
837 pow2 = 2*pow2;
838 }
839
840 const int nt = 256 / pow2;
841 const int nb = ((*n) + nt - 1) / nt;
842 const size_t local_item_size[2] = {nt, pow2};
843 const size_t global_item_size[2] = {nb * nt, pow2};
844
845 if((*j)*nb > red_s) {
846 red_s = (*j)*nb;
847 if (bufred != NULL) {
848 free(bufred);
850 }
851 bufred = (real *) malloc((*j) * nb * sizeof(real));
852
854 (*j) * nb * sizeof(real), NULL, &err);
855 CL_CHECK(err);
856 }
857
858 cl_kernel kernel = clCreateKernel(math_program, "glsc3_many_kernel", &err);
859 CL_CHECK(err);
860
861 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w));
862 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &v));
863 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mult));
864 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
865 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), j));
866 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
867
870 0, NULL, &kern_wait));
871
873 bufred_d, CL_TRUE, 0, (*j) * nb * sizeof(real),
874 bufred, 1, &kern_wait, NULL));
875
876 for (k = 0; k < (*j); k++) {
877 h[k] = 0.0;
878 }
879
880 for (i = 0; i < nb; i++) {
881 for (k = 0; k < (*j); k++) {
882 h[k] += bufred[i*(*j)+k];
883 }
884 }
885}
886
891real opencl_glsc2(void *a, void *b, int *n) {
892 cl_int err;
894 int i;
895
896 if (math_program == NULL)
898
899 const int nb = ((*n) + 256 - 1) / 256;
900 const size_t global_item_size = 256 * nb;
901 const size_t local_item_size = 256;
902
903 real * buf = (real *) malloc(nb * sizeof(real));
904
905 cl_kernel kernel = clCreateKernel(math_program, "glsc2_kernel", &err);
906 CL_CHECK(err);
907
909 nb * sizeof(real), NULL, &err);
910 CL_CHECK(err);
911
912 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
913 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
914 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
915 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
916
919 0, NULL, &kern_wait));
920
921
923 0, nb * sizeof(real), buf, 1, &kern_wait, NULL));
924
925 real res = 0.0;
926 for (i = 0; i < nb; i++) {
927 res += buf[i];
928 }
929
930 free(buf);
932
933 return res;
934}
935
940real opencl_glsum(void *a, int *n) {
941 cl_int err;
943 int i;
944
945 if (math_program == NULL)
947
948 const int nb = ((*n) + 256 - 1) / 256;
949 const size_t global_item_size = 256 * nb;
950 const size_t local_item_size = 256;
951
952 real * buf = (real *) malloc(nb * sizeof(real));
953
954 cl_kernel kernel = clCreateKernel(math_program, "glsum_kernel", &err);
955 CL_CHECK(err);
956
958 nb * sizeof(real), NULL, &err);
959 CL_CHECK(err);
960
961 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
962 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
963 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
964
967 0, NULL, &kern_wait));
968
969
971 0, nb * sizeof(real), buf, 1, &kern_wait, NULL));
972
973 real res = 0.0;
974 for (i = 0; i < nb; i++) {
975 res += buf[i];
976 }
977
978 free(buf);
980
981 return res;
982}
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
const int i
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
void * glb_ctx
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition jit.c:50
void opencl_add3(void *a, void *b, void *c, int *n)
Definition math.c:298
void opencl_addcol3(void *a, void *b, void *c, int *n)
Definition math.c:680
void opencl_invcol1(void *a, int *n)
Definition math.c:497
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
Definition math.c:827
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n)
Definition math.c:441
void opencl_cmult(void *a, real *c, int *n)
Definition math.c:170
void opencl_sub3(void *a, void *b, void *c, int *n)
Definition math.c:653
void opencl_rone(void *a, int *n)
Definition math.c:130
void opencl_cadd(void *a, real *c, int *n)
Definition math.c:195
void opencl_cmult2(void *a, void *b, real *c, int *n)
Definition math.c:143
void opencl_add4(void *a, void *b, void *c, void *d, int *n)
Definition math.c:325
real opencl_glsc3(void *a, void *b, void *c, int *n)
Definition math.c:773
void opencl_add2s2(void *a, void *b, real *c1, int *n)
Definition math.c:382
void opencl_rzero(void *a, int *n)
Definition math.c:117
void opencl_sub2(void *a, void *b, int *n)
Definition math.c:627
void opencl_col2(void *a, void *b, int *n)
Definition math.c:547
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n)
Definition math.c:707
void opencl_col3(void *a, void *b, void *c, int *n)
Definition math.c:573
real * bufred
Definition math.c:766
void opencl_subcol3(void *a, void *b, void *c, int *n)
Definition math.c:600
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
Definition math.c:468
int red_s
Definition math.c:765
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n)
Definition math.c:411
void opencl_invcol2(void *a, void *b, int *n)
Definition math.c:521
cl_mem bufred_d
Definition math.c:767
void opencl_cadd2(void *a, void *b, real *c, int *n)
Definition math.c:220
void opencl_add2(void *a, void *b, int *n)
Definition math.c:272
void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m)
Definition math.c:62
void opencl_cfill_mask(void *a, void *c, int *size, void *mask, int *mask_size)
Definition math.c:90
void opencl_cfill(void *a, real *c, int *n)
Definition math.c:246
void opencl_add2s1(void *a, void *b, real *c1, int *n)
Definition math.c:354
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
Definition math.c:736
real opencl_glsc2(void *a, void *b, int *n)
Definition math.c:891
real opencl_glsum(void *a, int *n)
Definition math.c:940
void opencl_copy(void *a, void *b, int *n)
Definition math.c:53
#define CL_CHECK(err)
Definition check.h:12
real * buf
Definition pipecg_aux.cu:42
void * math_program