Neko  0.9.99
A portable framework for high-order spectral element flow simulations
math.c
Go to the documentation of this file.
1 /*
2  Copyright (c) 2021-2024, The Neko Authors
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above
13  copyright notice, this list of conditions and the following
14  disclaimer in the documentation and/or other materials provided
15  with the distribution.
16 
17  * Neither the name of the authors nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  POSSIBILITY OF SUCH DAMAGE.
33 */
34 
35 #ifdef __APPLE__
36 #include <OpenCL/cl.h>
37 #else
38 #include <CL/cl.h>
39 #endif
40 
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <device/device_config.h>
44 #include <device/opencl/jit.h>
45 #include <device/opencl/prgm_lib.h>
46 #include <device/opencl/check.h>
47 
48 #include "math_kernel.cl.h"
49 
53 void opencl_copy(void *a, void *b, int *n) {
54  CL_CHECK(clEnqueueCopyBuffer((cl_command_queue) glb_cmd_queue,
55  b, a, 0, 0, (*n) * sizeof(real),
56  0, NULL, NULL));
57 }
58 
62 void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m) {
63  cl_int err;
64 
65  if (math_program == NULL)
66  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
67 
68  cl_kernel kernel = clCreateKernel(math_program, "masked_copy_kernel", &err);
69  CL_CHECK(err);
70 
71  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
72  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
73  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
74  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
75  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), m));
76 
77  const int nb = ((*n) + 256 - 1) / 256;
78  const size_t global_item_size = 256 * nb;
79  const size_t local_item_size = 256;
80 
81  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
82  NULL, &global_item_size, &local_item_size,
83  0, NULL, NULL));
84 
85 }
86 
90 void opencl_cfill_mask(void* a, void* c, int* size, void* mask, int* mask_size) {
91  cl_int err;
92 
93  if (math_program == NULL)
94  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
95 
96  cl_kernel kernel = clCreateKernel(math_program, "cfill_mask_kernel", &err);
97  CL_CHECK(err);
98 
99  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
100  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
101  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), size));
102  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &mask));
103  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), mask_size));
104 
105  const int nb = ((*mask_size) + 256 - 1) / 256;
106  const size_t global_item_size = 256 * nb;
107  const size_t local_item_size = 256;
108 
109  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
110  NULL, &global_item_size, &local_item_size,
111  0, NULL, NULL));
112  }
113 
117 void opencl_rzero(void *a, int *n) {
118  cl_event wait_kern;
119  real zero = 0.0;
120 
121  CL_CHECK(clEnqueueFillBuffer((cl_command_queue) glb_cmd_queue,
122  a, &zero, sizeof(real), 0,
123  (*n) * sizeof(real), 0, NULL, &wait_kern));
124  CL_CHECK(clWaitForEvents(1, &wait_kern));
125 }
126 
130 void opencl_rone(void *a, int *n) {
131  cl_event wait_kern;
132  real one = 1.0;
133 
134  CL_CHECK(clEnqueueFillBuffer((cl_command_queue) glb_cmd_queue,
135  a, &one, sizeof(real), 0,
136  (*n) * sizeof(real), 0, NULL, &wait_kern));
137  CL_CHECK(clWaitForEvents(1, &wait_kern));
138 }
139 
143 void opencl_cmult2(void *a, void *b, real *c, int *n) {
144  cl_int err;
145 
146  if (math_program == NULL)
147  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
148 
149  cl_kernel kernel = clCreateKernel(math_program, "cmult2_kernel", &err);
150  CL_CHECK(err);
151 
152  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
153  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
154  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
155  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
156 
157  const int nb = ((*n) + 256 - 1) / 256;
158  const size_t global_item_size = 256 * nb;
159  const size_t local_item_size = 256;
160 
161  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
162  NULL, &global_item_size, &local_item_size,
163  0, NULL, NULL));
164 }
165 
166 
170 void opencl_cmult(void *a, real *c, int *n) {
171  cl_int err;
172 
173  if (math_program == NULL)
174  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
175 
176  cl_kernel kernel = clCreateKernel(math_program, "cmult_kernel", &err);
177  CL_CHECK(err);
178 
179  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
180  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
181  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
182 
183  const int nb = ((*n) + 256 - 1) / 256;
184  const size_t global_item_size = 256 * nb;
185  const size_t local_item_size = 256;
186 
187  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
188  NULL, &global_item_size, &local_item_size,
189  0, NULL, NULL));
190 }
191 
195 void opencl_cadd(void *a, real *c, int *n) {
196  cl_int err;
197 
198  if (math_program == NULL)
199  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
200 
201  cl_kernel kernel = clCreateKernel(math_program, "cadd_kernel", &err);
202  CL_CHECK(err);
203 
204  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
205  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
206  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
207 
208  const int nb = ((*n) + 256 - 1) / 256;
209  const size_t global_item_size = 256 * nb;
210  const size_t local_item_size = 256;
211 
212  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
213  NULL, &global_item_size, &local_item_size,
214  0, NULL, NULL));
215 }
216 
220 void opencl_cadd2(void *a, void *b, real *c, int *n) {
221  cl_int err;
222 
223  if (math_program == NULL)
224  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
225 
226  cl_kernel kernel = clCreateKernel(math_program, "cadd2_kernel", &err);
227  CL_CHECK(err);
228 
229  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
230  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
231  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
232  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
233 
234  const int nb = ((*n) + 256 - 1) / 256;
235  const size_t global_item_size = 256 * nb;
236  const size_t local_item_size = 256;
237 
238  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
239  NULL, &global_item_size, &local_item_size,
240  0, NULL, NULL));
241 }
242 
246 void opencl_cfill(void *a, real *c, int *n) {
247  cl_int err;
248 
249  if (math_program == NULL)
250  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
251 
252  cl_kernel kernel = clCreateKernel(math_program, "cfill_kernel", &err);
253  CL_CHECK(err);
254 
255  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
256  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
257  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
258 
259  const int nb = ((*n) + 256 - 1) / 256;
260  const size_t global_item_size = 256 * nb;
261  const size_t local_item_size = 256;
262 
263  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
264  NULL, &global_item_size, &local_item_size,
265  0, NULL, NULL));
266 }
267 
272 void opencl_add2(void *a, void *b, int *n) {
273  cl_int err;
274 
275  if (math_program == NULL)
276  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
277 
278  cl_kernel kernel = clCreateKernel(math_program, "add2_kernel", &err);
279  CL_CHECK(err);
280 
281  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
282  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
283  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
284 
285  const int nb = ((*n) + 256 - 1) / 256;
286  const size_t global_item_size = 256 * nb;
287  const size_t local_item_size = 256;
288 
289  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
290  NULL, &global_item_size, &local_item_size,
291  0, NULL, NULL));
292 }
293 
298 void opencl_add3(void *a, void *b, void *c, int *n) {
299  cl_int err;
300 
301  if (math_program == NULL)
302  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
303 
304  cl_kernel kernel = clCreateKernel(math_program, "add3_kernel", &err);
305  CL_CHECK(err);
306 
307  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
308  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
309  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
310  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
311 
312  const int nb = ((*n) + 256 - 1) / 256;
313  const size_t global_item_size = 256 * nb;
314  const size_t local_item_size = 256;
315 
316  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
317  NULL, &global_item_size, &local_item_size,
318  0, NULL, NULL));
319 }
320 
325 void opencl_add4(void *a, void *b, void *c, void *d, int *n) {
326  cl_int err;
327 
328  if (math_program == NULL)
329  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
330 
331  cl_kernel kernel = clCreateKernel(math_program, "add4_kernel", &err);
332  CL_CHECK(err);
333 
334  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
335  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
336  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
337  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
338  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
339 
340  const int nb = ((*n) + 256 - 1) / 256;
341  const size_t global_item_size = 256 * nb;
342  const size_t local_item_size = 256;
343 
344  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
345  NULL, &global_item_size, &local_item_size,
346  0, NULL, NULL));
347 }
348 
354 void opencl_add2s1(void *a, void *b, real *c1, int *n) {
355  cl_int err;
356 
357  if (math_program == NULL)
358  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
359 
360  cl_kernel kernel = clCreateKernel(math_program, "add2s1_kernel", &err);
361  CL_CHECK(err);
362 
363  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
364  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
365  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
366  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
367 
368  const int nb = ((*n) + 256 - 1) / 256;
369  const size_t global_item_size = 256 * nb;
370  const size_t local_item_size = 256;
371 
372  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
373  NULL, &global_item_size, &local_item_size,
374  0, NULL, NULL));
375 }
376 
382 void opencl_add2s2(void *a, void *b, real *c1, int *n) {
383  cl_int err;
384 
385  if (math_program == NULL)
386  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
387 
388  cl_kernel kernel = clCreateKernel(math_program, "add2s2_kernel", &err);
389  CL_CHECK(err);
390 
391  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
392  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
393  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
394  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
395 
396  const int nb = ((*n) + 256 - 1) / 256;
397  const size_t global_item_size = 256 * nb;
398  const size_t local_item_size = 256;
399 
400  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
401  NULL, &global_item_size, &local_item_size,
402  0, NULL, NULL));
403 }
404 
411 void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n) {
412  cl_int err;
413 
414  if (math_program == NULL)
415  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
416 
417  cl_kernel kernel = clCreateKernel(math_program, "add2s2_many_kernel", &err);
418  CL_CHECK(err);
419 
420  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &x));
421  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &p));
422  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &alpha));
423  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), j));
424  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
425 
426  const int nb = ((*n) + 256 - 1) / 256;
427  const size_t global_item_size = 256 * nb;
428  const size_t local_item_size = 256;
429 
430  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
431  NULL, &global_item_size, &local_item_size,
432  0, NULL, NULL));
433 
434 }
435 
441 void opencl_addsqr2s2(void *a, void *b, real *c1, int *n) {
442  cl_int err;
443 
444  if (math_program == NULL)
445  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
446 
447  cl_kernel kernel = clCreateKernel(math_program, "addsqr2s2_kernel", &err);
448  CL_CHECK(err);
449 
450  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
451  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
452  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
453  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
454 
455  const int nb = ((*n) + 256 - 1) / 256;
456  const size_t global_item_size = 256 * nb;
457  const size_t local_item_size = 256;
458 
459  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
460  NULL, &global_item_size, &local_item_size,
461  0, NULL, NULL));
462 }
463 
468 void opencl_add3s2(void *a, void *b, void * c, real *c1, real *c2, int *n) {
469  cl_int err;
470 
471  if (math_program == NULL)
472  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
473 
474  cl_kernel kernel = clCreateKernel(math_program, "add3s2_kernel", &err);
475  CL_CHECK(err);
476 
477  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
478  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
479  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
480  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), c1));
481  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c2));
482  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
483 
484  const int nb = ((*n) + 256 - 1) / 256;
485  const size_t global_item_size = 256 * nb;
486  const size_t local_item_size = 256;
487 
488  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
489  NULL, &global_item_size, &local_item_size,
490  0, NULL, NULL));
491 }
492 
497 void opencl_invcol1(void *a, int *n) {
498  cl_int err;
499 
500  if (math_program == NULL)
501  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
502 
503  cl_kernel kernel = clCreateKernel(math_program, "invcol1_kernel", &err);
504 
505  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
506  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
507 
508  const int nb = ((*n) + 256 - 1) / 256;
509  const size_t global_item_size = 256 * nb;
510  const size_t local_item_size = 256;
511 
512  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
513  NULL, &global_item_size, &local_item_size,
514  0, NULL, NULL));
515 }
516 
521 void opencl_invcol2(void *a, void *b, int *n) {
522  cl_int err;
523 
524  if (math_program == NULL)
525  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
526 
527  cl_kernel kernel = clCreateKernel(math_program, "invcol2_kernel", &err);
528  CL_CHECK(err);
529 
530  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
531  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
532  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
533 
534  const int nb = ((*n) + 256 - 1) / 256;
535  const size_t global_item_size = 256 * nb;
536  const size_t local_item_size = 256;
537 
538  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
539  NULL, &global_item_size, &local_item_size,
540  0, NULL, NULL));
541 }
542 
547 void opencl_col2(void *a, void *b, int *n) {
548  cl_int err;
549 
550  if (math_program == NULL)
551  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
552 
553  cl_kernel kernel = clCreateKernel(math_program, "col2_kernel", &err);
554  CL_CHECK(err);
555 
556  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
557  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
558  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
559 
560  const int nb = ((*n) + 256 - 1) / 256;
561  const size_t global_item_size = 256 * nb;
562  const size_t local_item_size = 256;
563 
564  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
565  NULL, &global_item_size, &local_item_size,
566  0, NULL, NULL));
567 }
568 
573 void opencl_col3(void *a, void *b, void *c, int *n) {
574  cl_int err;
575 
576  if (math_program == NULL)
577  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
578 
579  cl_kernel kernel = clCreateKernel(math_program, "col3_kernel", &err);
580  CL_CHECK(err);
581 
582  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
583  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
584  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
585  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
586 
587  const int nb = ((*n) + 256 - 1) / 256;
588  const size_t global_item_size = 256 * nb;
589  const size_t local_item_size = 256;
590 
591  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
592  NULL, &global_item_size, &local_item_size,
593  0, NULL, NULL));
594 }
595 
600 void opencl_subcol3(void *a, void *b, void *c, int *n) {
601  cl_int err;
602 
603  if (math_program == NULL)
604  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
605 
606  cl_kernel kernel = clCreateKernel(math_program, "subcol3_kernel", &err);
607  CL_CHECK(err);
608 
609  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
610  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
611  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
612  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
613 
614  const int nb = ((*n) + 256 - 1) / 256;
615  const size_t global_item_size = 256 * nb;
616  const size_t local_item_size = 256;
617 
618  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
619  NULL, &global_item_size, &local_item_size,
620  0, NULL, NULL));
621 }
622 
627 void opencl_sub2(void *a, void *b, int *n) {
628  cl_int err;
629 
630  if (math_program == NULL)
631  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
632 
633  cl_kernel kernel = clCreateKernel(math_program, "sub2_kernel", &err);
634  CL_CHECK(err);
635 
636  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
637  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
638  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
639 
640  const int nb = ((*n) + 256 - 1) / 256;
641  const size_t global_item_size = 256 * nb;
642  const size_t local_item_size = 256;
643 
644  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
645  NULL, &global_item_size, &local_item_size,
646  0, NULL, NULL));
647 }
648 
653 void opencl_sub3(void *a, void *b, void *c, int *n) {
654  cl_int err;
655 
656  if (math_program == NULL)
657  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
658 
659  cl_kernel kernel = clCreateKernel(math_program, "sub3_kernel", &err);
660  CL_CHECK(err);
661 
662  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
663  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
664  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
665  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
666 
667  const int nb = ((*n) + 256 - 1) / 256;
668  const size_t global_item_size = 256 * nb;
669  const size_t local_item_size = 256;
670 
671  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
672  NULL, &global_item_size, &local_item_size,
673  0, NULL, NULL));
674 }
675 
680 void opencl_addcol3(void *a, void *b, void *c, int *n) {
681  cl_int err;
682 
683  if (math_program == NULL)
684  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
685 
686  cl_kernel kernel = clCreateKernel(math_program, "addcol3_kernel", &err);
687  CL_CHECK(err);
688 
689  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
690  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
691  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
692  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
693 
694  const int nb = ((*n) + 256 - 1) / 256;
695  const size_t global_item_size = 256 * nb;
696  const size_t local_item_size = 256;
697 
698  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
699  NULL, &global_item_size, &local_item_size,
700  0, NULL, NULL));
701 }
702 
707 void opencl_addcol4(void *a, void *b, void *c, void *d, int *n) {
708  cl_int err;
709 
710  if (math_program == NULL)
711  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
712 
713  cl_kernel kernel = clCreateKernel(math_program, "addcol4_kernel", &err);
714  CL_CHECK(err);
715 
716  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
717  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
718  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
719  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
720  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
721 
722  const int nb = ((*n) + 256 - 1) / 256;
723  const size_t global_item_size = 256 * nb;
724  const size_t local_item_size = 256;
725 
726  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
727  NULL, &global_item_size, &local_item_size,
728  0, NULL, NULL));
729 }
730 
736 void opencl_vdot3(void *dot, void *u1, void *u2, void *u3,
737  void *v1, void *v2, void *v3, int *n) {
738  cl_int err;
739 
740  if (math_program == NULL)
741  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
742 
743  cl_kernel kernel = clCreateKernel(math_program, "vdot3_kernel", &err);
744  CL_CHECK(err);
745 
746  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dot));
747  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u1));
748  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u2));
749  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u3));
750  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v1));
751  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v2));
752  CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &v3));
753  CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
754 
755  const int nb = ((*n) + 256 - 1) / 256;
756  const size_t global_item_size = 256 * nb;
757  const size_t local_item_size = 256;
758 
759  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
760  NULL, &global_item_size, &local_item_size,
761  0, NULL, NULL));
762 }
763 
765 int red_s = 0;
766 real *bufred = NULL;
767 cl_mem bufred_d = NULL;
768 
773 real opencl_glsc3(void *a, void *b, void *c, int *n) {
774  cl_int err;
775  cl_event kern_wait;
776  int i;
777 
778  if (math_program == NULL)
779  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
780 
781  const int nb = ((*n) + 256 - 1) / 256;
782  const size_t global_item_size = 256 * nb;
783  const size_t local_item_size = 256;
784 
785  if ( nb > red_s){
786  red_s = nb;
787  if (bufred != NULL) {
788  free(bufred);
789  CL_CHECK(clReleaseMemObject(bufred_d));
790  }
791  bufred = (real *) malloc(nb * sizeof(real));
792 
793  bufred_d = clCreateBuffer(glb_ctx, CL_MEM_READ_WRITE,
794  nb * sizeof(real), NULL, &err);
795  CL_CHECK(err);
796  }
797 
798  cl_kernel kernel = clCreateKernel(math_program, "glsc3_kernel", &err);
799  CL_CHECK(err);
800 
801  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
802  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
803  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
804  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
805  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
806 
807  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
808  NULL, &global_item_size, &local_item_size,
809  0, NULL, &kern_wait));
810 
811  CL_CHECK(clEnqueueReadBuffer((cl_command_queue) glb_cmd_queue, bufred_d,
812  CL_TRUE, 0, nb * sizeof(real), bufred, 1,
813  &kern_wait, NULL));
814 
815  real res = 0.0;
816  for (i = 0; i < nb; i++) {
817  res += bufred[i];
818  }
819 
820  return res;
821 }
822 
827 void opencl_glsc3_many(real *h, void * w, void *v, void *mult, int *j, int *n){
828  int i, k;
829  cl_int err;
830  cl_event kern_wait;
831 
832  if (math_program == NULL)
833  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
834 
835  int pow2 = 1;
836  while(pow2 < (*j)){
837  pow2 = 2*pow2;
838  }
839 
840  const int nt = 256 / pow2;
841  const int nb = ((*n) + nt - 1) / nt;
842  const size_t local_item_size[2] = {nt, pow2};
843  const size_t global_item_size[2] = {nb * nt, pow2};
844 
845  if((*j)*nb > red_s) {
846  red_s = (*j)*nb;
847  if (bufred != NULL) {
848  free(bufred);
849  CL_CHECK(clReleaseMemObject(bufred_d));
850  }
851  bufred = (real *) malloc((*j) * nb * sizeof(real));
852 
853  bufred_d = clCreateBuffer(glb_ctx, CL_MEM_READ_WRITE,
854  (*j) * nb * sizeof(real), NULL, &err);
855  CL_CHECK(err);
856  }
857 
858  cl_kernel kernel = clCreateKernel(math_program, "glsc3_many_kernel", &err);
859  CL_CHECK(err);
860 
861  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w));
862  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &v));
863  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mult));
864  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
865  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), j));
866  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
867 
868  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 2,
869  NULL, global_item_size, local_item_size,
870  0, NULL, &kern_wait));
871 
872  CL_CHECK(clEnqueueReadBuffer((cl_command_queue) glb_cmd_queue,
873  bufred_d, CL_TRUE, 0, (*j) * nb * sizeof(real),
874  bufred, 1, &kern_wait, NULL));
875 
876  for (k = 0; k < (*j); k++) {
877  h[k] = 0.0;
878  }
879 
880  for (i = 0; i < nb; i++) {
881  for (k = 0; k < (*j); k++) {
882  h[k] += bufred[i*(*j)+k];
883  }
884  }
885 }
886 
891 real opencl_glsc2(void *a, void *b, int *n) {
892  cl_int err;
893  cl_event kern_wait;
894  int i;
895 
896  if (math_program == NULL)
897  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
898 
899  const int nb = ((*n) + 256 - 1) / 256;
900  const size_t global_item_size = 256 * nb;
901  const size_t local_item_size = 256;
902 
903  real * buf = (real *) malloc(nb * sizeof(real));
904 
905  cl_kernel kernel = clCreateKernel(math_program, "glsc2_kernel", &err);
906  CL_CHECK(err);
907 
908  cl_mem buf_d = clCreateBuffer(glb_ctx, CL_MEM_READ_WRITE,
909  nb * sizeof(real), NULL, &err);
910  CL_CHECK(err);
911 
912  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
913  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
914  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
915  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
916 
917  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
918  NULL, &global_item_size, &local_item_size,
919  0, NULL, &kern_wait));
920 
921 
922  CL_CHECK(clEnqueueReadBuffer((cl_command_queue) glb_cmd_queue, buf_d, CL_TRUE,
923  0, nb * sizeof(real), buf, 1, &kern_wait, NULL));
924 
925  real res = 0.0;
926  for (i = 0; i < nb; i++) {
927  res += buf[i];
928  }
929 
930  free(buf);
931  CL_CHECK(clReleaseMemObject(buf_d));
932 
933  return res;
934 }
935 
940 real opencl_glsum(void *a, int *n) {
941  cl_int err;
942  cl_event kern_wait;
943  int i;
944 
945  if (math_program == NULL)
946  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
947 
948  const int nb = ((*n) + 256 - 1) / 256;
949  const size_t global_item_size = 256 * nb;
950  const size_t local_item_size = 256;
951 
952  real * buf = (real *) malloc(nb * sizeof(real));
953 
954  cl_kernel kernel = clCreateKernel(math_program, "glsum_kernel", &err);
955  CL_CHECK(err);
956 
957  cl_mem buf_d = clCreateBuffer(glb_ctx, CL_MEM_READ_WRITE,
958  nb * sizeof(real), NULL, &err);
959  CL_CHECK(err);
960 
961  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
962  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
963  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
964 
965  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
966  NULL, &global_item_size, &local_item_size,
967  0, NULL, &kern_wait));
968 
969 
970  CL_CHECK(clEnqueueReadBuffer((cl_command_queue) glb_cmd_queue, buf_d, CL_TRUE,
971  0, nb * sizeof(real), buf, 1, &kern_wait, NULL));
972 
973  real res = 0.0;
974  for (i = 0; i < nb; i++) {
975  res += buf[i];
976  }
977 
978  free(buf);
979  CL_CHECK(clReleaseMemObject(buf_d));
980 
981  return res;
982 }
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
const int i
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
Definition: cdtp_kernel.h:106
double real
Definition: device_config.h:12
void * glb_cmd_queue
void * glb_ctx
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition: jit.c:50
void opencl_add3(void *a, void *b, void *c, int *n)
Definition: math.c:298
void opencl_addcol3(void *a, void *b, void *c, int *n)
Definition: math.c:680
void opencl_invcol1(void *a, int *n)
Definition: math.c:497
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
Definition: math.c:827
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n)
Definition: math.c:441
void opencl_cmult(void *a, real *c, int *n)
Definition: math.c:170
void opencl_sub3(void *a, void *b, void *c, int *n)
Definition: math.c:653
void opencl_rone(void *a, int *n)
Definition: math.c:130
void opencl_cadd(void *a, real *c, int *n)
Definition: math.c:195
void opencl_cmult2(void *a, void *b, real *c, int *n)
Definition: math.c:143
void opencl_add4(void *a, void *b, void *c, void *d, int *n)
Definition: math.c:325
real opencl_glsc3(void *a, void *b, void *c, int *n)
Definition: math.c:773
void opencl_add2s2(void *a, void *b, real *c1, int *n)
Definition: math.c:382
void opencl_rzero(void *a, int *n)
Definition: math.c:117
void opencl_sub2(void *a, void *b, int *n)
Definition: math.c:627
void opencl_col2(void *a, void *b, int *n)
Definition: math.c:547
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n)
Definition: math.c:707
void opencl_col3(void *a, void *b, void *c, int *n)
Definition: math.c:573
real * bufred
Definition: math.c:766
void opencl_subcol3(void *a, void *b, void *c, int *n)
Definition: math.c:600
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
Definition: math.c:468
int red_s
Definition: math.c:765
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n)
Definition: math.c:411
void opencl_invcol2(void *a, void *b, int *n)
Definition: math.c:521
cl_mem bufred_d
Definition: math.c:767
void opencl_cadd2(void *a, void *b, real *c, int *n)
Definition: math.c:220
void opencl_add2(void *a, void *b, int *n)
Definition: math.c:272
void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m)
Definition: math.c:62
void opencl_cfill_mask(void *a, void *c, int *size, void *mask, int *mask_size)
Definition: math.c:90
void opencl_cfill(void *a, real *c, int *n)
Definition: math.c:246
void opencl_add2s1(void *a, void *b, real *c1, int *n)
Definition: math.c:354
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
Definition: math.c:736
real opencl_glsc2(void *a, void *b, int *n)
Definition: math.c:891
real opencl_glsum(void *a, int *n)
Definition: math.c:940
void opencl_copy(void *a, void *b, int *n)
Definition: math.c:53
#define CL_CHECK(err)
Definition: check.h:12
real * buf
Definition: pipecg_aux.cu:42
void * math_program