Neko 0.9.99
A portable framework for high-order spectral element flow simulations
math.cu
/*
 Copyright (c) 2021-2023, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

#include "math_kernel.h"
#include <device/device_config.h>
#include <device/cuda/check.h>
#include <stdio.h>
#include <stdlib.h>

#ifdef HAVE_DEVICE_MPI
/* Declarations of the device-aware MPI reduction primitives
 * (device_mpi_allreduce, DEVICE_MPI_SUM) used by the global
 * reductions below; the header locations are assumed. */
#include <comm/device_mpi_reduce.h>
#include <comm/device_mpi_op.h>
#endif

extern "C" {

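  /* All wrappers in this file take scalar arguments by pointer, matching
   * Fortran's pass-by-reference convention so they can be called through
   * ISO C bindings, and every operation is launched asynchronously on the
   * global command queue glb_cmd_queue. */
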
  /** Copy a vector, \f$ a = b \f$ */
  void cuda_copy(void *a, void *b, int *n) {
    CUDA_CHECK(cudaMemcpyAsync(a, b, (*n) * sizeof(real),
                               cudaMemcpyDeviceToDevice,
                               (cudaStream_t) glb_cmd_queue));
  }
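
  /* Hypothetical host-side usage sketch (x_d and y_d are assumed device
   * pointers): the copy runs asynchronously on glb_cmd_queue, so the
   * queue must be synchronized before the result is read on the host.
   *
   *   int n = 1024;
   *   cuda_copy(x_d, y_d, &n);
   *   cudaStreamSynchronize((cudaStream_t) glb_cmd_queue);
   */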

  /** Copy a masked vector, \f$ a(mask) = b(mask) \f$ */
  void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

    masked_copy_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (int *) mask, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Copy a masked vector to reduced contiguous storage,
      \f$ a_i = b_{mask_i} \f$ */
  void cuda_masked_red_copy(void *a, void *b, void *mask, int *n, int *m) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

    masked_red_copy_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (int *) mask, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Fill a constant into the masked entries of a vector,
      \f$ a(mask) = c \f$ */
  void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);

    cfill_mask_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *size,
                                      mask, *mask_size);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Zero a real vector, \f$ a = 0 \f$ */
  void cuda_rzero(void *a, int *n) {
    CUDA_CHECK(cudaMemsetAsync(a, 0, (*n) * sizeof(real),
                               (cudaStream_t) glb_cmd_queue));
  }

  /** Multiplication by a constant, \f$ a = c \cdot a \f$ */
  void cuda_cmult(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cmult_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }
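
  /* All kernel launches in this file use the same configuration: 1024
   * threads per block and a grid sized by ceiling division,
   * nblcks = (n + 1024 - 1) / 1024. For example, n = 2500 gives
   * (2500 + 1023) / 1024 = 3 blocks (3072 threads); the kernels are
   * expected to mask out the 572 surplus threads with a bounds check
   * against n. */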

  /** Multiplication by a constant, \f$ a = c \cdot b \f$ */
  void cuda_cmult2(void *a, void *b, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cmult2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Add a scalar to a vector, \f$ a = a + c \f$ */
  void cuda_cadd(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cadd_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Add a scalar to a vector, \f$ a = b + c \f$ */
  void cuda_cadd2(void *a, void *b, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cadd2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Set all elements to a constant, \f$ a = c \f$ */
  void cuda_cfill(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    if (*n > 0) {
      cfill_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
      CUDA_CHECK(cudaGetLastError());
    }
  }

  /** Vector addition, \f$ a = a + b \f$ */
  void cuda_add2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Vector addition, \f$ a = b + c \f$ */
  void cuda_add3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Vector addition, \f$ a = b + c + d \f$ */
  void cuda_add4(void *a, void *b, void *c, void *d, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add4_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, (real *) d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Scaled vector addition, \f$ a = c_1 a + b \f$ */
  void cuda_add2s1(void *a, void *b, real *c1, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2s1_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Scaled vector addition, \f$ a = a + c_1 b \f$ */
  void cuda_add2s2(void *a, void *b, real *c1, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2s2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Scaled addition of several vectors,
      \f$ x = x + \sum_{k < j} \alpha_k p_k \f$ */
  void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2s2_many_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) x, (const real **) p,
                                      (real *) alpha, *j, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Scaled addition of a squared vector,
      \f$ a = a + c_1 (b \cdot b) \f$ */
  void cuda_addsqr2s2(void *a, void *b, real *c1, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    addsqr2s2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Scaled vector addition, \f$ a = c_1 b + c_2 c \f$ */
  void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add3s2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c,
                                      *c1, *c2, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Invert a vector, \f$ a_i = 1 / a_i \f$ */
  void cuda_invcol1(void *a, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    invcol1_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Vector division, \f$ a_i = a_i / b_i \f$ */
  void cuda_invcol2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    invcol2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Point-wise multiplication, \f$ a_i = a_i b_i \f$ */
  void cuda_col2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    col2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Point-wise multiplication, \f$ a_i = b_i c_i \f$ */
  void cuda_col3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    col3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Subtract a product, \f$ a_i = a_i - b_i c_i \f$ */
  void cuda_subcol3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    subcol3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Vector subtraction, \f$ a = a - b \f$ */
  void cuda_sub2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Vector subtraction, \f$ a = b - c \f$ */
  void cuda_sub3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Add a product, \f$ a_i = a_i + b_i c_i \f$ */
  void cuda_addcol3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    addcol3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Add a triple product, \f$ a_i = a_i + b_i c_i d_i \f$ */
  void cuda_addcol4(void *a, void *b, void *c, void *d, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    addcol4_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, (real *) d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Point-wise dot products of two 3D vector fields,
      \f$ dot_i = u_{1,i} v_{1,i} + u_{2,i} v_{2,i} + u_{3,i} v_{3,i} \f$ */
  void cuda_vdot3(void *dot, void *u1, void *u2, void *u3,
                  void *v1, void *v2, void *v3, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    vdot3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) dot, (real *) u1,
                                      (real *) u2, (real *) u3,
                                      (real *) v1, (real *) v2,
                                      (real *) v3, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Point-wise cross products of two 3D vector fields,
      \f$ u = v \times w \f$ */
  void cuda_vcross(void *u1, void *u2, void *u3,
                   void *v1, void *v2, void *v3,
                   void *w1, void *w2, void *w3, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    vcross_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) u1, (real *) u2, (real *) u3,
                                      (real *) v1, (real *) v2, (real *) v3,
                                      (real *) w1, (real *) w2, (real *) w3,
                                      *n);
    CUDA_CHECK(cudaGetLastError());
  }


  /*
   * Reduction buffer
   */
  int red_s = 0;
  real * bufred = NULL;
  real * bufred_d = NULL;
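
  /* The reductions below share a pinned host buffer (bufred) and a device
   * buffer (bufred_d) holding one partial result per block, grown lazily
   * and never shrunk. A minimal sketch of that recurring growth pattern,
   * factored into a helper (illustrative only; the functions below inline
   * it):
   *
   *   static void ensure_reduction_buffers(int nb) {
   *     if (nb > red_s) {
   *       red_s = nb;
   *       if (bufred != NULL) {
   *         CUDA_CHECK(cudaFreeHost(bufred));
   *         CUDA_CHECK(cudaFree(bufred_d));
   *       }
   *       CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
   *       CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
   *     }
   *   }
   */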

  /** Weighted inner product, \f$ \sum_i u_i v_i w_i \f$,
      local to this rank */
  real cuda_vlsc3(void *u, void *v, void *w, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }

    glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *) u, (real *) v, (real *) w, bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    return bufred[0];
  }
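
  /* Two-stage reduction: glsc3_kernel leaves one partial sum per block in
   * bufred_d, and a single 1024-thread block (reduce_kernel) collapses the
   * nb partials into bufred_d[0]. cuda_vlsc3 returns this rank-local value
   * as is; cuda_glsc3 below additionally sums it across MPI ranks. */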

  /** Weighted inner product, \f$ \sum_i a_i b_i c_i \f$,
      reduced across all ranks */
  real cuda_glsc3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }

    glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *) a, (real *) b, (real *) c, bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    /* Sum the partial results across ranks, directly from device memory */
    device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif

    return bufred[0];
  }

  /** Weighted inner products of several vectors against a common vector,
      \f$ h_k = \sum_i w_i \, v_{k,i} \, mult_i \f$ for \f$ k < j \f$,
      reduced across all ranks */
  void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n) {
    /* Round j up to the next power of two so each block maps the j
       vectors along x and nt = 1024 / pow2 elements along y */
    int pow2 = 1;
    while (pow2 < (*j)) {
      pow2 = 2 * pow2;
    }
    const int nt = 1024 / pow2;
    const dim3 nthrds(pow2, nt, 1);
    const dim3 nblcks(((*n) + nt - 1) / nt, 1, 1);
    const int nb = ((*n) + nt - 1) / nt;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if ((*j) * nb > red_s) {
      red_s = (*j) * nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, (*j) * nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, (*j) * nb * sizeof(real)));
    }

    glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((const real *) w, (const real **) v,
       (const real *) mult, bufred_d, *j, *n);
    CUDA_CHECK(cudaGetLastError());
    glsc3_reduce_kernel<real><<<(*j), 1024, 0, stream>>>(bufred_d, nb, *j);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    /* Sum the j partial results across ranks, directly into h */
    device_mpi_allreduce(bufred_d, h, *j, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(h, bufred_d, (*j) * sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif
  }
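
  /* Layout example: j = 3 vectors rounds up to pow2 = 4, so each block is
   * 4 x 256 threads; 256 elements are processed per block along y and the
   * grid holds (n + 255) / 256 blocks, leaving one partial sum per
   * (vector, block) pair in bufred_d. */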

  /** Inner product, \f$ \sum_i a_i b_i \f$, reduced across all ranks */
  real cuda_glsc2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }

    glsc2_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *) a,
                                                      (real *) b,
                                                      bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    /* Sum the partial results across ranks, directly from device memory */
    device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif

    return bufred[0];
  }

  /** Vector sum, \f$ \sum_i a_i \f$, reduced across all ranks */
  real cuda_glsum(void *a, int *n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }
    if (*n > 0) {
      glsum_kernel<real>
        <<<nblcks, nthrds, 0, stream>>>((real *) a, bufred_d, *n);
      CUDA_CHECK(cudaGetLastError());
      reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
    }
#ifdef HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    /* Sum the partial results across ranks, directly from device memory */
    device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif

    return bufred[0];
  }

  /** Absolute value, \f$ a_i = |a_i| \f$ */
  void cuda_absval(void *a, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    absval_kernel<real>
      <<<nblcks, nthrds, 0, stream>>>((real *) a, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  // ======================================================================== //
  // Point-wise operations.

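  /* Naming convention, inferred from the argument lists: *_vec2 updates a
   * in place from b; *_vec3 writes op(b, c) into a; *_sca2 combines a with
   * the scalar c in place; *_sca3 writes op(b, c) into a, where op is the
   * point-wise max or min. */
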
  /** Point-wise maximum, \f$ a_i = \max(a_i, b_i) \f$ */
  void cuda_pwmax_vec2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmax_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Point-wise maximum, \f$ a_i = \max(b_i, c_i) \f$ */
  void cuda_pwmax_vec3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmax_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, (real *) b, (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Point-wise maximum against a scalar, \f$ a_i = \max(a_i, c) \f$ */
  void cuda_pwmax_sca2(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmax_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Point-wise maximum against a scalar, \f$ a_i = \max(b_i, c) \f$ */
  void cuda_pwmax_sca3(void *a, void *b, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmax_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, (real *) b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Point-wise minimum, \f$ a_i = \min(a_i, b_i) \f$ */
  void cuda_pwmin_vec2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmin_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Point-wise minimum, \f$ a_i = \min(b_i, c_i) \f$ */
  void cuda_pwmin_vec3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmin_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, (real *) b, (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Point-wise minimum against a scalar, \f$ a_i = \min(a_i, c) \f$ */
  void cuda_pwmin_sca2(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmin_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Point-wise minimum against a scalar, \f$ a_i = \min(b_i, c) \f$ */
  void cuda_pwmin_sca3(void *a, void *b, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    pwmin_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real *) a, (real *) b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

} /* extern "C" */