Neko  0.8.1
A portable framework for high-order spectral element flow simulations
math.c
Go to the documentation of this file.
1 /*
2  Copyright (c) 2021-2024, The Neko Authors
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above
13  copyright notice, this list of conditions and the following
14  disclaimer in the documentation and/or other materials provided
15  with the distribution.
16 
17  * Neither the name of the authors nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  POSSIBILITY OF SUCH DAMAGE.
33 */
34 
35 #ifdef __APPLE__
36 #include <OpenCL/cl.h>
37 #else
38 #include <CL/cl.h>
39 #endif
40 
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <device/device_config.h>
44 #include <device/opencl/jit.h>
45 #include <device/opencl/prgm_lib.h>
46 #include <device/opencl/check.h>
47 
48 #include "math_kernel.cl.h"
49 
53 void opencl_copy(void *a, void *b, int *n) {
54  CL_CHECK(clEnqueueCopyBuffer((cl_command_queue) glb_cmd_queue,
55  b, a, 0, 0, (*n) * sizeof(real),
56  0, NULL, NULL));
57 }
58 
62 void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m) {
63  cl_int err;
64 
65  if (math_program == NULL)
66  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
67 
68  cl_kernel kernel = clCreateKernel(math_program, "masked_copy_kernel", &err);
69  CL_CHECK(err);
70 
71  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
72  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
73  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mask));
74  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
75  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), m));
76 
77  const int nb = ((*n) + 256 - 1) / 256;
78  const size_t global_item_size = 256 * nb;
79  const size_t local_item_size = 256;
80 
81  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
82  NULL, &global_item_size, &local_item_size,
83  0, NULL, NULL));
84 
85 }
86 
90 void opencl_rzero(void *a, int *n) {
91  cl_event wait_kern;
92  real zero = 0.0;
93 
94  CL_CHECK(clEnqueueFillBuffer((cl_command_queue) glb_cmd_queue,
95  a, &zero, sizeof(real), 0,
96  (*n) * sizeof(real), 0, NULL, &wait_kern));
97  CL_CHECK(clWaitForEvents(1, &wait_kern));
98 }
99 
103 void opencl_rone(void *a, int *n) {
104  cl_event wait_kern;
105  real one = 1.0;
106 
107  CL_CHECK(clEnqueueFillBuffer((cl_command_queue) glb_cmd_queue,
108  a, &one, sizeof(real), 0,
109  (*n) * sizeof(real), 0, NULL, &wait_kern));
110  CL_CHECK(clWaitForEvents(1, &wait_kern));
111 }
112 
116 void opencl_cmult2(void *a, void *b, real *c, int *n) {
117  cl_int err;
118 
119  if (math_program == NULL)
120  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
121 
122  cl_kernel kernel = clCreateKernel(math_program, "cmult2_kernel", &err);
123  CL_CHECK(err);
124 
125  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
126  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
127  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c));
128  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
129 
130  const int nb = ((*n) + 256 - 1) / 256;
131  const size_t global_item_size = 256 * nb;
132  const size_t local_item_size = 256;
133 
134  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
135  NULL, &global_item_size, &local_item_size,
136  0, NULL, NULL));
137 }
138 
139 
143 void opencl_cmult(void *a, real *c, int *n) {
144  cl_int err;
145 
146  if (math_program == NULL)
147  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
148 
149  cl_kernel kernel = clCreateKernel(math_program, "cmult_kernel", &err);
150  CL_CHECK(err);
151 
152  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
153  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
154  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
155 
156  const int nb = ((*n) + 256 - 1) / 256;
157  const size_t global_item_size = 256 * nb;
158  const size_t local_item_size = 256;
159 
160  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
161  NULL, &global_item_size, &local_item_size,
162  0, NULL, NULL));
163 }
164 
168 void opencl_cadd(void *a, real *c, int *n) {
169  cl_int err;
170 
171  if (math_program == NULL)
172  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
173 
174  cl_kernel kernel = clCreateKernel(math_program, "cadd_kernel", &err);
175  CL_CHECK(err);
176 
177  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
178  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
179  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
180 
181  const int nb = ((*n) + 256 - 1) / 256;
182  const size_t global_item_size = 256 * nb;
183  const size_t local_item_size = 256;
184 
185  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
186  NULL, &global_item_size, &local_item_size,
187  0, NULL, NULL));
188 }
189 
193 void opencl_cfill(void *a, real *c, int *n) {
194  cl_int err;
195 
196  if (math_program == NULL)
197  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
198 
199  cl_kernel kernel = clCreateKernel(math_program, "cfill_kernel", &err);
200  CL_CHECK(err);
201 
202  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
203  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), c));
204  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
205 
206  const int nb = ((*n) + 256 - 1) / 256;
207  const size_t global_item_size = 256 * nb;
208  const size_t local_item_size = 256;
209 
210  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
211  NULL, &global_item_size, &local_item_size,
212  0, NULL, NULL));
213 }
214 
219 void opencl_add2(void *a, void *b, int *n) {
220  cl_int err;
221 
222  if (math_program == NULL)
223  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
224 
225  cl_kernel kernel = clCreateKernel(math_program, "add2_kernel", &err);
226  CL_CHECK(err);
227 
228  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
229  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
230  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
231 
232  const int nb = ((*n) + 256 - 1) / 256;
233  const size_t global_item_size = 256 * nb;
234  const size_t local_item_size = 256;
235 
236  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
237  NULL, &global_item_size, &local_item_size,
238  0, NULL, NULL));
239 }
240 
246 void opencl_add2s1(void *a, void *b, real *c1, int *n) {
247  cl_int err;
248 
249  if (math_program == NULL)
250  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
251 
252  cl_kernel kernel = clCreateKernel(math_program, "add2s1_kernel", &err);
253  CL_CHECK(err);
254 
255  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
256  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
257  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
258  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
259 
260  const int nb = ((*n) + 256 - 1) / 256;
261  const size_t global_item_size = 256 * nb;
262  const size_t local_item_size = 256;
263 
264  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
265  NULL, &global_item_size, &local_item_size,
266  0, NULL, NULL));
267 }
268 
274 void opencl_add2s2(void *a, void *b, real *c1, int *n) {
275  cl_int err;
276 
277  if (math_program == NULL)
278  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
279 
280  cl_kernel kernel = clCreateKernel(math_program, "add2s2_kernel", &err);
281  CL_CHECK(err);
282 
283  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
284  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
285  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
286  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
287 
288  const int nb = ((*n) + 256 - 1) / 256;
289  const size_t global_item_size = 256 * nb;
290  const size_t local_item_size = 256;
291 
292  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
293  NULL, &global_item_size, &local_item_size,
294  0, NULL, NULL));
295 }
296 
303 void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n) {
304  cl_int err;
305 
306  if (math_program == NULL)
307  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
308 
309  cl_kernel kernel = clCreateKernel(math_program, "add2s2_many_kernel", &err);
310  CL_CHECK(err);
311 
312  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &x));
313  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &p));
314  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &alpha));
315  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), j));
316  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
317 
318  const int nb = ((*n) + 256 - 1) / 256;
319  const size_t global_item_size = 256 * nb;
320  const size_t local_item_size = 256;
321 
322  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
323  NULL, &global_item_size, &local_item_size,
324  0, NULL, NULL));
325 
326 }
327 
333 void opencl_addsqr2s2(void *a, void *b, real *c1, int *n) {
334  cl_int err;
335 
336  if (math_program == NULL)
337  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
338 
339  cl_kernel kernel = clCreateKernel(math_program, "addsqr2s2_kernel", &err);
340  CL_CHECK(err);
341 
342  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
343  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
344  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), c1));
345  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
346 
347  const int nb = ((*n) + 256 - 1) / 256;
348  const size_t global_item_size = 256 * nb;
349  const size_t local_item_size = 256;
350 
351  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
352  NULL, &global_item_size, &local_item_size,
353  0, NULL, NULL));
354 }
355 
360 void opencl_add3s2(void *a, void *b, void * c, real *c1, real *c2, int *n) {
361  cl_int err;
362 
363  if (math_program == NULL)
364  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
365 
366  cl_kernel kernel = clCreateKernel(math_program, "add3s2_kernel", &err);
367  CL_CHECK(err);
368 
369  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
370  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
371  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
372  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), c1));
373  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(real), c2));
374  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
375 
376  const int nb = ((*n) + 256 - 1) / 256;
377  const size_t global_item_size = 256 * nb;
378  const size_t local_item_size = 256;
379 
380  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
381  NULL, &global_item_size, &local_item_size,
382  0, NULL, NULL));
383 }
384 
389 void opencl_invcol1(void *a, int *n) {
390  cl_int err;
391 
392  if (math_program == NULL)
393  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
394 
395  cl_kernel kernel = clCreateKernel(math_program, "invcol1_kernel", &err);
396 
397  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
398  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), n));
399 
400  const int nb = ((*n) + 256 - 1) / 256;
401  const size_t global_item_size = 256 * nb;
402  const size_t local_item_size = 256;
403 
404  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
405  NULL, &global_item_size, &local_item_size,
406  0, NULL, NULL));
407 }
408 
413 void opencl_invcol2(void *a, void *b, int *n) {
414  cl_int err;
415 
416  if (math_program == NULL)
417  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
418 
419  cl_kernel kernel = clCreateKernel(math_program, "invcol2_kernel", &err);
420  CL_CHECK(err);
421 
422  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
423  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
424  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
425 
426  const int nb = ((*n) + 256 - 1) / 256;
427  const size_t global_item_size = 256 * nb;
428  const size_t local_item_size = 256;
429 
430  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
431  NULL, &global_item_size, &local_item_size,
432  0, NULL, NULL));
433 }
434 
439 void opencl_col2(void *a, void *b, int *n) {
440  cl_int err;
441 
442  if (math_program == NULL)
443  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
444 
445  cl_kernel kernel = clCreateKernel(math_program, "col2_kernel", &err);
446  CL_CHECK(err);
447 
448  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
449  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
450  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
451 
452  const int nb = ((*n) + 256 - 1) / 256;
453  const size_t global_item_size = 256 * nb;
454  const size_t local_item_size = 256;
455 
456  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
457  NULL, &global_item_size, &local_item_size,
458  0, NULL, NULL));
459 }
460 
465 void opencl_col3(void *a, void *b, void *c, int *n) {
466  cl_int err;
467 
468  if (math_program == NULL)
469  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
470 
471  cl_kernel kernel = clCreateKernel(math_program, "col3_kernel", &err);
472  CL_CHECK(err);
473 
474  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
475  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
476  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
477  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
478 
479  const int nb = ((*n) + 256 - 1) / 256;
480  const size_t global_item_size = 256 * nb;
481  const size_t local_item_size = 256;
482 
483  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
484  NULL, &global_item_size, &local_item_size,
485  0, NULL, NULL));
486 }
487 
492 void opencl_subcol3(void *a, void *b, void *c, int *n) {
493  cl_int err;
494 
495  if (math_program == NULL)
496  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
497 
498  cl_kernel kernel = clCreateKernel(math_program, "subcol3_kernel", &err);
499  CL_CHECK(err);
500 
501  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
502  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
503  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
504  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
505 
506  const int nb = ((*n) + 256 - 1) / 256;
507  const size_t global_item_size = 256 * nb;
508  const size_t local_item_size = 256;
509 
510  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
511  NULL, &global_item_size, &local_item_size,
512  0, NULL, NULL));
513 }
514 
519 void opencl_sub2(void *a, void *b, int *n) {
520  cl_int err;
521 
522  if (math_program == NULL)
523  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
524 
525  cl_kernel kernel = clCreateKernel(math_program, "sub2_kernel", &err);
526  CL_CHECK(err);
527 
528  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
529  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
530  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
531 
532  const int nb = ((*n) + 256 - 1) / 256;
533  const size_t global_item_size = 256 * nb;
534  const size_t local_item_size = 256;
535 
536  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
537  NULL, &global_item_size, &local_item_size,
538  0, NULL, NULL));
539 }
540 
545 void opencl_sub3(void *a, void *b, void *c, int *n) {
546  cl_int err;
547 
548  if (math_program == NULL)
549  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
550 
551  cl_kernel kernel = clCreateKernel(math_program, "sub3_kernel", &err);
552  CL_CHECK(err);
553 
554  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
555  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
556  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
557  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
558 
559  const int nb = ((*n) + 256 - 1) / 256;
560  const size_t global_item_size = 256 * nb;
561  const size_t local_item_size = 256;
562 
563  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
564  NULL, &global_item_size, &local_item_size,
565  0, NULL, NULL));
566 }
567 
572 void opencl_addcol3(void *a, void *b, void *c, int *n) {
573  cl_int err;
574 
575  if (math_program == NULL)
576  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
577 
578  cl_kernel kernel = clCreateKernel(math_program, "addcol3_kernel", &err);
579  CL_CHECK(err);
580 
581  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
582  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
583  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
584  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
585 
586  const int nb = ((*n) + 256 - 1) / 256;
587  const size_t global_item_size = 256 * nb;
588  const size_t local_item_size = 256;
589 
590  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
591  NULL, &global_item_size, &local_item_size,
592  0, NULL, NULL));
593 }
594 
599 void opencl_addcol4(void *a, void *b, void *c, void *d, int *n) {
600  cl_int err;
601 
602  if (math_program == NULL)
603  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
604 
605  cl_kernel kernel = clCreateKernel(math_program, "addcol4_kernel", &err);
606  CL_CHECK(err);
607 
608  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
609  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
610  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
611  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
612  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
613 
614  const int nb = ((*n) + 256 - 1) / 256;
615  const size_t global_item_size = 256 * nb;
616  const size_t local_item_size = 256;
617 
618  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
619  NULL, &global_item_size, &local_item_size,
620  0, NULL, NULL));
621 }
622 
628 void opencl_vdot3(void *dot, void *u1, void *u2, void *u3,
629  void *v1, void *v2, void *v3, int *n) {
630  cl_int err;
631 
632  if (math_program == NULL)
633  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
634 
635  cl_kernel kernel = clCreateKernel(math_program, "vdot3_kernel", &err);
636  CL_CHECK(err);
637 
638  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &dot));
639  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &u1));
640  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &u2));
641  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &u3));
642  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &v1));
643  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &v2));
644  CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &v3));
645  CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), n));
646 
647  const int nb = ((*n) + 256 - 1) / 256;
648  const size_t global_item_size = 256 * nb;
649  const size_t local_item_size = 256;
650 
651  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
652  NULL, &global_item_size, &local_item_size,
653  0, NULL, NULL));
654 }
655 
657 int red_s = 0;
658 real *bufred = NULL;
659 cl_mem bufred_d = NULL;
660 
665 real opencl_glsc3(void *a, void *b, void *c, int *n) {
666  cl_int err;
667  cl_event kern_wait;
668  int i;
669 
670  if (math_program == NULL)
671  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
672 
673  const int nb = ((*n) + 256 - 1) / 256;
674  const size_t global_item_size = 256 * nb;
675  const size_t local_item_size = 256;
676 
677  if ( nb > red_s){
678  red_s = nb;
679  if (bufred != NULL) {
680  free(bufred);
681  CL_CHECK(clReleaseMemObject(bufred_d));
682  }
683  bufred = (real *) malloc(nb * sizeof(real));
684 
685  bufred_d = clCreateBuffer(glb_ctx, CL_MEM_READ_WRITE,
686  nb * sizeof(real), NULL, &err);
687  CL_CHECK(err);
688  }
689 
690  cl_kernel kernel = clCreateKernel(math_program, "glsc3_kernel", &err);
691  CL_CHECK(err);
692 
693  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
694  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
695  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
696  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
697  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
698 
699  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
700  NULL, &global_item_size, &local_item_size,
701  0, NULL, &kern_wait));
702 
703  CL_CHECK(clEnqueueReadBuffer((cl_command_queue) glb_cmd_queue, bufred_d,
704  CL_TRUE, 0, nb * sizeof(real), bufred, 1,
705  &kern_wait, NULL));
706 
707  real res = 0.0;
708  for (i = 0; i < nb; i++) {
709  res += bufred[i];
710  }
711 
712  return res;
713 }
714 
719 void opencl_glsc3_many(real *h, void * w, void *v, void *mult, int *j, int *n){
720  int i, k;
721  cl_int err;
722  cl_event kern_wait;
723 
724  if (math_program == NULL)
725  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
726 
727  int pow2 = 1;
728  while(pow2 < (*j)){
729  pow2 = 2*pow2;
730  }
731 
732  const int nt = 256 / pow2;
733  const int nb = ((*n) + nt - 1) / nt;
734  const size_t local_item_size[2] = {nt, pow2};
735  const size_t global_item_size[2] = {nb * nt, pow2};
736 
737  if((*j)*nb > red_s) {
738  red_s = (*j)*nb;
739  if (bufred != NULL) {
740  free(bufred);
741  CL_CHECK(clReleaseMemObject(bufred_d));
742  }
743  bufred = (real *) malloc((*j) * nb * sizeof(real));
744 
745  bufred_d = clCreateBuffer(glb_ctx, CL_MEM_READ_WRITE,
746  (*j) * nb * sizeof(real), NULL, &err);
747  CL_CHECK(err);
748  }
749 
750  cl_kernel kernel = clCreateKernel(math_program, "glsc3_many_kernel", &err);
751  CL_CHECK(err);
752 
753  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &w));
754  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &v));
755  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &mult));
756  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &bufred_d));
757  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), j));
758  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), n));
759 
760  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 2,
761  NULL, global_item_size, local_item_size,
762  0, NULL, &kern_wait));
763 
764  CL_CHECK(clEnqueueReadBuffer((cl_command_queue) glb_cmd_queue,
765  bufred_d, CL_TRUE, 0, (*j) * nb * sizeof(real),
766  bufred, 1, &kern_wait, NULL));
767 
768  for (k = 0; k < (*j); k++) {
769  h[k] = 0.0;
770  }
771 
772  for (i = 0; i < nb; i++) {
773  for (k = 0; k < (*j); k++) {
774  h[k] += bufred[i*(*j)+k];
775  }
776  }
777 }
778 
783 real opencl_glsc2(void *a, void *b, int *n) {
784  cl_int err;
785  cl_event kern_wait;
786  int i;
787 
788  if (math_program == NULL)
789  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
790 
791  const int nb = ((*n) + 256 - 1) / 256;
792  const size_t global_item_size = 256 * nb;
793  const size_t local_item_size = 256;
794 
795  real * buf = (real *) malloc(nb * sizeof(real));
796 
797  cl_kernel kernel = clCreateKernel(math_program, "glsc2_kernel", &err);
798  CL_CHECK(err);
799 
800  cl_mem buf_d = clCreateBuffer(glb_ctx, CL_MEM_READ_WRITE,
801  nb * sizeof(real), NULL, &err);
802  CL_CHECK(err);
803 
804  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
805  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
806  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &buf_d));
807  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
808 
809  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
810  NULL, &global_item_size, &local_item_size,
811  0, NULL, &kern_wait));
812 
813 
814  CL_CHECK(clEnqueueReadBuffer((cl_command_queue) glb_cmd_queue, buf_d, CL_TRUE,
815  0, nb * sizeof(real), buf, 1, &kern_wait, NULL));
816 
817  real res = 0.0;
818  for (i = 0; i < nb; i++) {
819  res += buf[i];
820  }
821 
822  free(buf);
823  CL_CHECK(clReleaseMemObject(buf_d));
824 
825  return res;
826 }
827 
832 real opencl_glsum(void *a, int *n) {
833  cl_int err;
834  cl_event kern_wait;
835  int i;
836 
837  if (math_program == NULL)
838  opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
839 
840  const int nb = ((*n) + 256 - 1) / 256;
841  const size_t global_item_size = 256 * nb;
842  const size_t local_item_size = 256;
843 
844  real * buf = (real *) malloc(nb * sizeof(real));
845 
846  cl_kernel kernel = clCreateKernel(math_program, "glsum_kernel", &err);
847  CL_CHECK(err);
848 
849  cl_mem buf_d = clCreateBuffer(glb_ctx, CL_MEM_READ_WRITE,
850  nb * sizeof(real), NULL, &err);
851  CL_CHECK(err);
852 
853  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
854  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &buf_d));
855  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), n));
856 
857  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
858  NULL, &global_item_size, &local_item_size,
859  0, NULL, &kern_wait));
860 
861 
862  CL_CHECK(clEnqueueReadBuffer((cl_command_queue) glb_cmd_queue, buf_d, CL_TRUE,
863  0, nb * sizeof(real), buf, 1, &kern_wait, NULL));
864 
865  real res = 0.0;
866  for (i = 0; i < nb; i++) {
867  res += buf[i];
868  }
869 
870  free(buf);
871  CL_CHECK(clReleaseMemObject(buf_d));
872 
873  return res;
874 }
__global__ void const T *__restrict__ x
Definition: cdtp_kernel.h:109
const int i
Definition: cdtp_kernel.h:132
const int j
Definition: cdtp_kernel.h:131
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void const T *__restrict__ const T *__restrict__ v
double real
Definition: device_config.h:12
void * glb_cmd_queue
void * glb_ctx
void opencl_kernel_jit(const char *kernel, cl_program *program)
Definition: jit.c:50
void opencl_addcol3(void *a, void *b, void *c, int *n)
Definition: math.c:572
void opencl_invcol1(void *a, int *n)
Definition: math.c:389
void opencl_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
Definition: math.c:719
void opencl_addsqr2s2(void *a, void *b, real *c1, int *n)
Definition: math.c:333
void opencl_cmult(void *a, real *c, int *n)
Definition: math.c:143
void opencl_sub3(void *a, void *b, void *c, int *n)
Definition: math.c:545
void opencl_rone(void *a, int *n)
Definition: math.c:103
void opencl_cadd(void *a, real *c, int *n)
Definition: math.c:168
void opencl_cmult2(void *a, void *b, real *c, int *n)
Definition: math.c:116
real opencl_glsc3(void *a, void *b, void *c, int *n)
Definition: math.c:665
void opencl_add2s2(void *a, void *b, real *c1, int *n)
Definition: math.c:274
void opencl_rzero(void *a, int *n)
Definition: math.c:90
void opencl_sub2(void *a, void *b, int *n)
Definition: math.c:519
void opencl_col2(void *a, void *b, int *n)
Definition: math.c:439
void opencl_addcol4(void *a, void *b, void *c, void *d, int *n)
Definition: math.c:599
void opencl_col3(void *a, void *b, void *c, int *n)
Definition: math.c:465
real * bufred
Definition: math.c:658
void opencl_subcol3(void *a, void *b, void *c, int *n)
Definition: math.c:492
void opencl_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
Definition: math.c:360
int red_s
Definition: math.c:657
void opencl_add2s2_many(void *x, void *p, void *alpha, int *j, int *n)
Definition: math.c:303
void opencl_invcol2(void *a, void *b, int *n)
Definition: math.c:413
cl_mem bufred_d
Definition: math.c:659
void opencl_add2(void *a, void *b, int *n)
Definition: math.c:219
void opencl_masked_copy(void *a, void *b, void *mask, int *n, int *m)
Definition: math.c:62
void opencl_cfill(void *a, real *c, int *n)
Definition: math.c:193
void opencl_add2s1(void *a, void *b, real *c1, int *n)
Definition: math.c:246
void opencl_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
Definition: math.c:628
real opencl_glsc2(void *a, void *b, int *n)
Definition: math.c:783
real opencl_glsum(void *a, int *n)
Definition: math.c:832
void opencl_copy(void *a, void *b, int *n)
Definition: math.c:53
#define CL_CHECK(err)
Definition: check.h:12
real * buf
Definition: pipecg_aux.cu:42
void * math_program