Neko  0.8.99
A portable framework for high-order spectral element flow simulations
math.c
Go to the documentation of this file.
1 /*
2  Copyright (c) 2021-2024, The Neko Authors
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above
13  copyright notice, this list of conditions and the following
14  disclaimer in the documentation and/or other materials provided
15  with the distribution.
16 
17  * Neither the name of the authors nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  POSSIBILITY OF SUCH DAMAGE.
33 */
34 
35 #ifdef __APPLE__
36 #include <OpenCL/cl.h>
37 #else
38 #include <CL/cl.h>
39 #endif
40 
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <device/device_config.h>
44 #include <device/opencl/jit.h>
45 #include <device/opencl/prgm_lib.h>
46 #include <device/opencl/check.h>
47 
48 #include "math_kernel.cl.h"
49 
53 void opencl_copy(void *a, void *b, int *n) {
54  CL_CHECK(clEnqueueCopyBuffer((cl_command_queue) glb_cmd_queue,
55  b, a, 0, 0, (*n) * sizeof(real),
56  0, NULL, NULL));
57 }
58 
62 void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m) {
63  cl_int err;
64 
65  if (math_program == NULL)
66  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
67 
68  cl_kernel kernel = clCreateKernel(math_program, "masked_copy_kernel", &err);
69  CL_CHECK(err);
70 
71  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
72  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
73  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
74  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
75  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), m));
76 
77  const int nb = ((*n) + 256 - 1) / 256;
78  const size_t global_item_size = 256 * nb;
79  const size_t local_item_size = 256;
80 
81  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
82  NULL, &global_item_size, &local_item_size,
83  0, NULL, NULL));
84 
85 }
86 
90 void opencl_cfill_mask(void* a, void* c, int* size, void* mask, int* mask_size) {
91  cl_int err;
92 
93  if (math_program == NULL)
94  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
95 
96  cl_kernel kernel = clCreateKernel(math_program, "cfill_mask_kernel", &err);
97  CL_CHECK(err);
98 
99  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
100  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
101  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), size));
102  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &mask));
103  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), mask_size));
104 
105  const int nb = ((*mask_size) + 256 - 1) / 256;
106  const size_t global_item_size = 256 * nb;
107  const size_t local_item_size = 256;
108 
109  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
110  NULL, &global_item_size, &local_item_size,
111  0, NULL, NULL));
112  }
113 
117 void opencl_rzero(void *a, int *n) {
118  cl_event wait_kern;
119  real zero = 0.0;
120 
121  CL_CHECK(clEnqueueFillBuffer((cl_command_queue) glb_cmd_queue,
122  a, &zero, sizeof(real), 0,
123  (*n) * sizeof(real), 0, NULL, &wait_kern));
124  CL_CHECK(clWaitForEvents(1, &wait_kern));
125 }
126 
130 void opencl_rone(void *a, int *n) {
131  cl_event wait_kern;
132  real one = 1.0;
133 
134  CL_CHECK(clEnqueueFillBuffer((cl_command_queue) glb_cmd_queue,
135  a, &one, sizeof(real), 0,
136  (*n) * sizeof(real), 0, NULL, &wait_kern));
137  CL_CHECK(clWaitForEvents(1, &wait_kern));
138 }
139 
143 void opencl_cmult2(void *a, void *b, real *c, int *n) {
144  cl_int err;
145 
146  if (math_program == NULL)
147  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
148 
149  cl_kernel kernel = clCreateKernel(math_program, "cmult2_kernel", &err);
150  CL_CHECK(err);
151 
152  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
153  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
154  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
155  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
156 
157  const int nb = ((*n) + 256 - 1) / 256;
158  const size_t global_item_size = 256 * nb;
159  const size_t local_item_size = 256;
160 
161  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
162  NULL, &global_item_size, &local_item_size,
163  0, NULL, NULL));
164 }
165 
166 
170 void opencl_cmult(void *a, real *c, int *n) {
171  cl_int err;
172 
173  if (math_program == NULL)
174  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
175 
176  cl_kernel kernel = clCreateKernel(math_program, "cmult_kernel", &err);
177  CL_CHECK(err);
178 
179  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
180  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
181  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
182 
183  const int nb = ((*n) + 256 - 1) / 256;
184  const size_t global_item_size = 256 * nb;
185  const size_t local_item_size = 256;
186 
187  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
188  NULL, &global_item_size, &local_item_size,
189  0, NULL, NULL));
190 }
191 
195 void opencl_cadd(void *a, real *c, int *n) {
196  cl_int err;
197 
198  if (math_program == NULL)
199  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
200 
201  cl_kernel kernel = clCreateKernel(math_program, "cadd_kernel", &err);
202  CL_CHECK(err);
203 
204  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
205  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
206  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
207 
208  const int nb = ((*n) + 256 - 1) / 256;
209  const size_t global_item_size = 256 * nb;
210  const size_t local_item_size = 256;
211 
212  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
213  NULL, &global_item_size, &local_item_size,
214  0, NULL, NULL));
215 }
216 
220 void opencl_cadd2(void *a, void *b, real *c, int *n) {
221  cl_int err;
222 
223  if (math_program == NULL)
224  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
225 
226  cl_kernel kernel = clCreateKernel(math_program, "cadd2_kernel", &err);
227  CL_CHECK(err);
228 
229  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
230  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
231  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
232  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
233 
234  const int nb = ((*n) + 256 - 1) / 256;
235  const size_t global_item_size = 256 * nb;
236  const size_t local_item_size = 256;
237 
238  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
239  NULL, &global_item_size, &local_item_size,
240  0, NULL, NULL));
241 }
242 
246 void opencl_cfill(void *a, real *c, int *n) {
247  cl_int err;
248 
249  if (math_program == NULL)
250  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
251 
252  cl_kernel kernel = clCreateKernel(math_program, "cfill_kernel", &err);
253  CL_CHECK(err);
254 
255  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
256  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
257  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
258 
259  const int nb = ((*n) + 256 - 1) / 256;
260  const size_t global_item_size = 256 * nb;
261  const size_t local_item_size = 256;
262 
263  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
264  NULL, &global_item_size, &local_item_size,
265  0, NULL, NULL));
266 }
267 
272 void opencl_add2(void *a, void *b, int *n) {
273  cl_int err;
274 
275  if (math_program == NULL)
276  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
277 
278  cl_kernel kernel = clCreateKernel(math_program, "add2_kernel", &err);
279  CL_CHECK(err);
280 
281  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
282  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
283  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
284 
285  const int nb = ((*n) + 256 - 1) / 256;
286  const size_t global_item_size = 256 * nb;
287  const size_t local_item_size = 256;
288 
289  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
290  NULL, &global_item_size, &local_item_size,
291  0, NULL, NULL));
292 }
293 
298 void opencl_add3(void *a, void *b, void *c, int *n) {
299  cl_int err;
300 
301  if (math_program == NULL)
302  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
303 
304  cl_kernel kernel = clCreateKernel(math_program, "add3_kernel", &err);
305  CL_CHECK(err);
306 
307  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
308  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
309  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
310  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
311 
312  const int nb = ((*n) + 256 - 1) / 256;
313  const size_t global_item_size = 256 * nb;
314  const size_t local_item_size = 256;
315 
316  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
317  NULL, &global_item_size, &local_item_size,
318  0, NULL, NULL));
319 }
320 
326 void opencl_add2s1(void *a, void *b, real *c1, int *n) {
327  cl_int err;
328 
329  if (math_program == NULL)
330  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
331 
332  cl_kernel kernel = clCreateKernel(math_program, "add2s1_kernel", &err);
333  CL_CHECK(err);
334 
335  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
336  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
337  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
338  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
339 
340  const int nb = ((*n) + 256 - 1) / 256;
341  const size_t global_item_size = 256 * nb;
342  const size_t local_item_size = 256;
343 
344  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
345  NULL, &global_item_size, &local_item_size,
346  0, NULL, NULL));
347 }
348 
354 void opencl_add2s2(void *a, void *b, real *c1, int *n) {
355  cl_int err;
356 
357  if (math_program == NULL)
358  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
359 
360  cl_kernel kernel = clCreateKernel(math_program, "add2s2_kernel", &err);
361  CL_CHECK(err);
362 
363  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
364  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
365  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
366  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
367 
368  const int nb = ((*n) + 256 - 1) / 256;
369  const size_t global_item_size = 256 * nb;
370  const size_t local_item_size = 256;
371 
372  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
373  NULL, &global_item_size, &local_item_size,
374  0, NULL, NULL));
375 }
376 
383 void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n) {
384  cl_int err;
385 
386  if (math_program == NULL)
387  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
388 
389  cl_kernel kernel = clCreateKernel(math_program, "add2s2_many_kernel", &err);
390  CL_CHECK(err);
391 
392  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &x));
393  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &p));
394  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &alpha));
395  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), j));
396  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
397 
398  const int nb = ((*n) + 256 - 1) / 256;
399  const size_t global_item_size = 256 * nb;
400  const size_t local_item_size = 256;
401 
402  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
403  NULL, &global_item_size, &local_item_size,
404  0, NULL, NULL));
405 
406 }
407 
413 void opencl_addsqr2s2(void *a, void *b, real *c1, int *n) {
414  cl_int err;
415 
416  if (math_program == NULL)
417  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
418 
419  cl_kernel kernel = clCreateKernel(math_program, "addsqr2s2_kernel", &err);
420  CL_CHECK(err);
421 
422  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
423  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
424  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
425  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
426 
427  const int nb = ((*n) + 256 - 1) / 256;
428  const size_t global_item_size = 256 * nb;
429  const size_t local_item_size = 256;
430 
431  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
432  NULL, &global_item_size, &local_item_size,
433  0, NULL, NULL));
434 }
435 
440 void opencl_add3s2(void *a, void *b, void * c, real *c1, real *c2, int *n) {
441  cl_int err;
442 
443  if (math_program == NULL)
444  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
445 
446  cl_kernel kernel = clCreateKernel(math_program, "add3s2_kernel", &err);
447  CL_CHECK(err);
448 
449  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
450  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
451  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
452  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), c1));
453  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c2));
454  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
455 
456  const int nb = ((*n) + 256 - 1) / 256;
457  const size_t global_item_size = 256 * nb;
458  const size_t local_item_size = 256;
459 
460  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
461  NULL, &global_item_size, &local_item_size,
462  0, NULL, NULL));
463 }
464 
469 void opencl_invcol1(void *a, int *n) {
470  cl_int err;
471 
472  if (math_program == NULL)
473  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
474 
475  cl_kernel kernel = clCreateKernel(math_program, "invcol1_kernel", &err);
476 
477  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
478  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
479 
480  const int nb = ((*n) + 256 - 1) / 256;
481  const size_t global_item_size = 256 * nb;
482  const size_t local_item_size = 256;
483 
484  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
485  NULL, &global_item_size, &local_item_size,
486  0, NULL, NULL));
487 }
488 
493 void opencl_invcol2(void *a, void *b, int *n) {
494  cl_int err;
495 
496  if (math_program == NULL)
497  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
498 
499  cl_kernel kernel = clCreateKernel(math_program, "invcol2_kernel", &err);
500  CL_CHECK(err);
501 
502  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
503  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
504  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
505 
506  const int nb = ((*n) + 256 - 1) / 256;
507  const size_t global_item_size = 256 * nb;
508  const size_t local_item_size = 256;
509 
510  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
511  NULL, &global_item_size, &local_item_size,
512  0, NULL, NULL));
513 }
514 
519 void opencl_col2(void *a, void *b, int *n) {
520  cl_int err;
521 
522  if (math_program == NULL)
523  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
524 
525  cl_kernel kernel = clCreateKernel(math_program, "col2_kernel", &err);
526  CL_CHECK(err);
527 
528  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
529  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
530  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
531 
532  const int nb = ((*n) + 256 - 1) / 256;
533  const size_t global_item_size = 256 * nb;
534  const size_t local_item_size = 256;
535 
536  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
537  NULL, &global_item_size, &local_item_size,
538  0, NULL, NULL));
539 }
540 
545 void opencl_col3(void *a, void *b, void *c, int *n) {
546  cl_int err;
547 
548  if (math_program == NULL)
549  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
550 
551  cl_kernel kernel = clCreateKernel(math_program, "col3_kernel", &err);
552  CL_CHECK(err);
553 
554  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
555  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
556  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
557  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
558 
559  const int nb = ((*n) + 256 - 1) / 256;
560  const size_t global_item_size = 256 * nb;
561  const size_t local_item_size = 256;
562 
563  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
564  NULL, &global_item_size, &local_item_size,
565  0, NULL, NULL));
566 }
567 
572 void opencl_subcol3(void *a, void *b, void *c, int *n) {
573  cl_int err;
574 
575  if (math_program == NULL)
576  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
577 
578  cl_kernel kernel = clCreateKernel(math_program, "subcol3_kernel", &err);
579  CL_CHECK(err);
580 
581  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
582  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
583  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
584  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
585 
586  const int nb = ((*n) + 256 - 1) / 256;
587  const size_t global_item_size = 256 * nb;
588  const size_t local_item_size = 256;
589 
590  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
591  NULL, &global_item_size, &local_item_size,
592  0, NULL, NULL));
593 }
594 
599 void opencl_sub2(void *a, void *b, int *n) {
600  cl_int err;
601 
602  if (math_program == NULL)
603  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
604 
605  cl_kernel kernel = clCreateKernel(math_program, "sub2_kernel", &err);
606  CL_CHECK(err);
607 
608  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
609  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
610  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
611 
612  const int nb = ((*n) + 256 - 1) / 256;
613  const size_t global_item_size = 256 * nb;
614  const size_t local_item_size = 256;
615 
616  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
617  NULL, &global_item_size, &local_item_size,
618  0, NULL, NULL));
619 }
620 
625 void opencl_sub3(void *a, void *b, void *c, int *n) {
626  cl_int err;
627 
628  if (math_program == NULL)
629  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
630 
631  cl_kernel kernel = clCreateKernel(math_program, "sub3_kernel", &err);
632  CL_CHECK(err);
633 
634  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
635  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
636  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
637  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
638 
639  const int nb = ((*n) + 256 - 1) / 256;
640  const size_t global_item_size = 256 * nb;
641  const size_t local_item_size = 256;
642 
643  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
644  NULL, &global_item_size, &local_item_size,
645  0, NULL, NULL));
646 }
647 
652 void opencl_addcol3(void *a, void *b, void *c, int *n) {
653  cl_int err;
654 
655  if (math_program == NULL)
656  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
657 
658  cl_kernel kernel = clCreateKernel(math_program, "addcol3_kernel", &err);
659  CL_CHECK(err);
660 
661  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
662  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
663  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
664  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
665 
666  const int nb = ((*n) + 256 - 1) / 256;
667  const size_t global_item_size = 256 * nb;
668  const size_t local_item_size = 256;
669 
670  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
671  NULL, &global_item_size, &local_item_size,
672  0, NULL, NULL));
673 }
674 
679 void opencl_addcol4(void *a, void *b, void *c, void *d, int *n) {
680  cl_int err;
681 
682  if (math_program == NULL)
683  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
684 
685  cl_kernel kernel = clCreateKernel(math_program, "addcol4_kernel", &err);
686  CL_CHECK(err);
687 
688  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
689  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
690  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
691  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
692  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
693 
694  const int nb = ((*n) + 256 - 1) / 256;
695  const size_t global_item_size = 256 * nb;
696  const size_t local_item_size = 256;
697 
698  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
699  NULL, &global_item_size, &local_item_size,
700  0, NULL, NULL));
701 }
702 
708 void opencl_vdot3(void *dot, void *u1, void *u2, void *u3,
709  void *v1, void *v2, void *v3, int *n) {
710  cl_int err;
711 
712  if (math_program == NULL)
713  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
714 
715  cl_kernel kernel = clCreateKernel(math_program, "vdot3_kernel", &err);
716  CL_CHECK(err);
717 
718  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dot));
719  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u1));
720  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u2));
721  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u3));
722  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v1));
723  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v2));
724  CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &v3));
725  CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
726 
727  const int nb = ((*n) + 256 - 1) / 256;
728  const size_t global_item_size = 256 * nb;
729  const size_t local_item_size = 256;
730 
731  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
732  NULL, &global_item_size, &local_item_size,
733  0, NULL, NULL));
734 }
735 
737 int red_s = 0;
738 real *bufred = NULL;
739 cl_mem bufred_d = NULL;
740 
745 real opencl_glsc3(void *a, void *b, void *c, int *n) {
746  cl_int err;
747  cl_event kern_wait;
748  int i;
749 
750  if (math_program == NULL)
751  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
752 
753  const int nb = ((*n) + 256 - 1) / 256;
754  const size_t global_item_size = 256 * nb;
755  const size_t local_item_size = 256;
756 
757  if ( nb > red_s){
758  red_s = nb;
759  if (bufred != NULL) {
760  free(bufred);
761  CL_CHECK(clReleaseMemObject(bufred_d));
762  }
763  bufred = (real *) malloc(nb * sizeof(real));
764 
765  bufred_d = clCreateBuffer(glb_ctx, CL_MEM_READ_WRITE,
766  nb * sizeof(real), NULL, &err);
767  CL_CHECK(err);
768  }
769 
770  cl_kernel kernel = clCreateKernel(math_program, "glsc3_kernel", &err);
771  CL_CHECK(err);
772 
773  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
774  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
775  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
776  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
777  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
778 
779  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
780  NULL, &global_item_size, &local_item_size,
781  0, NULL, &kern_wait));
782 
783  CL_CHECK(clEnqueueReadBuffer((cl_command_queue) glb_cmd_queue, bufred_d,
784  CL_TRUE, 0, nb * sizeof(real), bufred, 1,
785  &kern_wait, NULL));
786 
787  real res = 0.0;
788  for (i = 0; i < nb; i++) {
789  res += bufred[i];
790  }
791 
792  return res;
793 }
794 
799 void opencl_glsc3_many(real *h, void * w, void *v, void *mult, int *j, int *n){
800  int i, k;
801  cl_int err;
802  cl_event kern_wait;
803 
804  if (math_program == NULL)
805  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
806 
807  int pow2 = 1;
808  while(pow2 < (*j)){
809  pow2 = 2*pow2;
810  }
811 
812  const int nt = 256 / pow2;
813  const int nb = ((*n) + nt - 1) / nt;
814  const size_t local_item_size[2] = {nt, pow2};
815  const size_t global_item_size[2] = {nb * nt, pow2};
816 
817  if((*j)*nb > red_s) {
818  red_s = (*j)*nb;
819  if (bufred != NULL) {
820  free(bufred);
821  CL_CHECK(clReleaseMemObject(bufred_d));
822  }
823  bufred = (real *) malloc((*j) * nb * sizeof(real));
824 
825  bufred_d = clCreateBuffer(glb_ctx, CL_MEM_READ_WRITE,
826  (*j) * nb * sizeof(real), NULL, &err);
827  CL_CHECK(err);
828  }
829 
830  cl_kernel kernel = clCreateKernel(math_program, "glsc3_many_kernel", &err);
831  CL_CHECK(err);
832 
833  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w));
834  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &v));
835  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mult));
836  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
837  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), j));
838  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
839 
840  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 2,
841  NULL, global_item_size, local_item_size,
842  0, NULL, &kern_wait));
843 
844  CL_CHECK(clEnqueueReadBuffer((cl_command_queue) glb_cmd_queue,
845  bufred_d, CL_TRUE, 0, (*j) * nb * sizeof(real),
846  bufred, 1, &kern_wait, NULL));
847 
848  for (k = 0; k < (*j); k++) {
849  h[k] = 0.0;
850  }
851 
852  for (i = 0; i < nb; i++) {
853  for (k = 0; k < (*j); k++) {
854  h[k] += bufred[i*(*j)+k];
855  }
856  }
857 }
858 
863 real opencl_glsc2(void *a, void *b, int *n) {
864  cl_int err;
865  cl_event kern_wait;
866  int i;
867 
868  if (math_program == NULL)
869  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
870 
871  const int nb = ((*n) + 256 - 1) / 256;
872  const size_t global_item_size = 256 * nb;
873  const size_t local_item_size = 256;
874 
875  real * buf = (real *) malloc(nb * sizeof(real));
876 
877  cl_kernel kernel = clCreateKernel(math_program, "glsc2_kernel", &err);
878  CL_CHECK(err);
879 
880  cl_mem buf_d = clCreateBuffer(glb_ctx, CL_MEM_READ_WRITE,
881  nb * sizeof(real), NULL, &err);
882  CL_CHECK(err);
883 
884  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
885  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
886  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
887  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
888 
889  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
890  NULL, &global_item_size, &local_item_size,
891  0, NULL, &kern_wait));
892 
893 
894  CL_CHECK(clEnqueueReadBuffer((cl_command_queue) glb_cmd_queue, buf_d, CL_TRUE,
895  0, nb * sizeof(real), buf, 1, &kern_wait, NULL));
896 
897  real res = 0.0;
898  for (i = 0; i < nb; i++) {
899  res += buf[i];
900  }
901 
902  free(buf);
903  CL_CHECK(clReleaseMemObject(buf_d));
904 
905  return res;
906 }
907 
912 real opencl_glsum(void *a, int *n) {
913  cl_int err;
914  cl_event kern_wait;
915  int i;
916 
917  if (math_program == NULL)
918  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
919 
920  const int nb = ((*n) + 256 - 1) / 256;
921  const size_t global_item_size = 256 * nb;
922  const size_t local_item_size = 256;
923 
924  real * buf = (real *) malloc(nb * sizeof(real));
925 
926  cl_kernel kernel = clCreateKernel(math_program, "glsum_kernel", &err);
927  CL_CHECK(err);
928 
929  cl_mem buf_d = clCreateBuffer(glb_ctx, CL_MEM_READ_WRITE,
930  nb * sizeof(real), NULL, &err);
931  CL_CHECK(err);
932 
933  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
934  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
935  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
936 
937  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
938  NULL, &global_item_size, &local_item_size,
939  0, NULL, &kern_wait));
940 
941 
942  CL_CHECK(clEnqueueReadBuffer((cl_command_queue) glb_cmd_queue, buf_d, CL_TRUE,
943  0, nb * sizeof(real), buf, 1, &kern_wait, NULL));
944 
945  real res = 0.0;
946  for (i = 0; i < nb; i++) {
947  res += buf[i];
948  }
949 
950  free(buf);
951  CL_CHECK(clReleaseMemObject(buf_d));
952 
953  return res;
954 }
__global__ void const T *__restrict__ x
Definition: cdtp_kernel.h:106
const int i
Definition: cdtp_kernel.h:128
const int j
Definition: cdtp_kernel.h:127
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void const T *__restrict__ const T *__restrict__ v
double real
Definition: device_config.h:12
void * glb_cmd_queue
void * glb_ctx
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition: jit.c:50
void opencl_add3(void *a, void *b, void *c, int *n)
Definition: math.c:298
void opencl_addcol3(void *a, void *b, void *c, int *n)
Definition: math.c:652
void opencl_invcol1(void *a, int *n)
Definition: math.c:469
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
Definition: math.c:799
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n)
Definition: math.c:413
void opencl_cmult(void *a, real *c, int *n)
Definition: math.c:170
void opencl_sub3(void *a, void *b, void *c, int *n)
Definition: math.c:625
void opencl_rone(void *a, int *n)
Definition: math.c:130
void opencl_cadd(void *a, real *c, int *n)
Definition: math.c:195
void opencl_cmult2(void *a, void *b, real *c, int *n)
Definition: math.c:143
real opencl_glsc3(void *a, void *b, void *c, int *n)
Definition: math.c:745
void opencl_add2s2(void *a, void *b, real *c1, int *n)
Definition: math.c:354
void opencl_rzero(void *a, int *n)
Definition: math.c:117
void opencl_sub2(void *a, void *b, int *n)
Definition: math.c:599
void opencl_col2(void *a, void *b, int *n)
Definition: math.c:519
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n)
Definition: math.c:679
void opencl_col3(void *a, void *b, void *c, int *n)
Definition: math.c:545
real * bufred
Definition: math.c:738
void opencl_subcol3(void *a, void *b, void *c, int *n)
Definition: math.c:572
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
Definition: math.c:440
int red_s
Definition: math.c:737
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n)
Definition: math.c:383
void opencl_invcol2(void *a, void *b, int *n)
Definition: math.c:493
cl_mem bufred_d
Definition: math.c:739
void opencl_cadd2(void *a, void *b, real *c, int *n)
Definition: math.c:220
void opencl_add2(void *a, void *b, int *n)
Definition: math.c:272
void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m)
Definition: math.c:62
void opencl_cfill_mask(void *a, void *c, int *size, void *mask, int *mask_size)
Definition: math.c:90
void opencl_cfill(void *a, real *c, int *n)
Definition: math.c:246
void opencl_add2s1(void *a, void *b, real *c1, int *n)
Definition: math.c:326
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
Definition: math.c:708
real opencl_glsc2(void *a, void *b, int *n)
Definition: math.c:863
real opencl_glsum(void *a, int *n)
Definition: math.c:912
void opencl_copy(void *a, void *b, int *n)
Definition: math.c:53
#define CL_CHECK(err)
Definition: check.h:12
real * buf
Definition: pipecg_aux.cu:42
void * math_program