Neko 0.8.1
A portable framework for high-order spectral element flow simulations
math.cu
/*
 Copyright (c) 2021-2023, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

#include "math_kernel.h"
#include <device/device_config.h>
#include <device/cuda/check.h>
#include <stdio.h>
#include <stdlib.h>

extern "C" {

  /** Copy a vector, a = b (device to device) */
  void cuda_copy(void *a, void *b, int *n) {
    CUDA_CHECK(cudaMemcpyAsync(a, b, (*n) * sizeof(real),
                               cudaMemcpyDeviceToDevice,
                               (cudaStream_t) glb_cmd_queue));
  }
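
  /*
   * Illustrative sketch, not part of math.cu: every wrapper in this file
   * is C-callable with all scalars passed by reference, consistent with
   * Fortran-style argument passing on the host side. A caller could look
   * like this, where example_host_copy, a_d and b_d are hypothetical
   * names and the device pointers are assumed allocated elsewhere.
   */
  void example_host_copy(void *a_d, void *b_d) {
    int n = 4096;
    cuda_copy(a_d, b_d, &n);  /* enqueues a = b on glb_cmd_queue */
  }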

  /** Copy a masked subset of a vector, a(mask) = b(mask) */
  void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

    masked_copy_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (int *) mask, *n, *m);
    CUDA_CHECK(cudaGetLastError());

  }
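
  /*
   * Sketch of the kernel shape cuda_masked_copy launches (illustrative
   * only, not the actual masked_copy_kernel): a grid-stride loop over the
   * m masked entries, gathering and scattering through the index list.
   * The real template lives in math_kernel.h, outside this extern "C"
   * block, and may use a different (e.g. 1-based Fortran) indexing
   * convention for the mask entries.
   */
  template< typename T >
  __global__ void masked_copy_sketch(T *a, const T *b, const int *mask,
                                     const int m) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;
    for (int i = idx; i < m; i += str)
      a[mask[i]] = b[mask[i]];  /* copy only the masked entries */
  }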

  /** Zero a real vector, a = 0 */
  void cuda_rzero(void *a, int *n) {
    CUDA_CHECK(cudaMemsetAsync(a, 0, (*n) * sizeof(real),
                               (cudaStream_t) glb_cmd_queue));
  }

  /** Multiplication by constant, a = c a */
  void cuda_cmult(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cmult_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());

  }
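
  /*
   * The launch configuration above recurs throughout this file: 1024
   * threads per block and a ceil-divided block count, (n + 1023) / 1024.
   * A kernel of cmult_kernel's shape presumably reduces to the following
   * (illustrative sketch only; the actual template is in math_kernel.h,
   * outside this extern "C" block).
   */
  template< typename T >
  __global__ void cmult_sketch(T *a, const T c, const int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;
    for (int i = idx; i < n; i += str)
      a[i] = c * a[i];  /* scale each element by the constant c */
  }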

  /** Multiplication by constant, a = c b */
  void cuda_cmult2(void *a, void *b, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cmult2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Add a scalar to a vector, a = a + c */
  void cuda_cadd(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cadd_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Set all elements to a constant, a = c */
  void cuda_cfill(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cfill_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Vector addition, a = a + b */
  void cuda_add2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Vector addition with scalar multiplication, a = c1 a + b
      (multiplication on the first argument) */
  void cuda_add2s1(void *a, void *b, real *c1, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2s1_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Vector addition with scalar multiplication, a = a + c1 b
      (multiplication on the second argument) */
  void cuda_add2s2(void *a, void *b, real *c1, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2s2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Accumulate several scaled vectors,
      x = x + alpha_1 p_1 + ... + alpha_j p_j */
  void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2s2_many_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) x, (const real **) p,
                                      (real *) alpha, *j, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Add a squared scaled vector, a = a + c1 (b b) */
  void cuda_addsqr2s2(void *a, void *b, real *c1, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    addsqr2s2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Sum of two scaled vectors, a = c1 b + c2 c */
  void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add3s2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c,
                                      *c1, *c2, *n);
    CUDA_CHECK(cudaGetLastError());

  }

  /** Pointwise inversion, a = 1 / a */
  void cuda_invcol1(void *a, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    invcol1_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Pointwise vector division, a = a / b */
  void cuda_invcol2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    invcol2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Pointwise vector multiplication, a = a b */
  void cuda_col2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    col2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Pointwise vector multiplication, a = b c */
  void cuda_col3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    col3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Subtract a pointwise product, a = a - b c */
  void cuda_subcol3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    subcol3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Vector subtraction, a = a - b */
  void cuda_sub2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Vector subtraction, a = b - c */
  void cuda_sub3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Add a pointwise product, a = a + b c */
  void cuda_addcol3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    addcol3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Add a pointwise triple product, a = a + b c d */
  void cuda_addcol4(void *a, void *b, void *c, void *d, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    addcol4_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, (real *) d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /** Pointwise dot product of two 3-component vector fields,
      dot = u1 v1 + u2 v2 + u3 v3 */
  void cuda_vdot3(void *dot, void *u1, void *u2, void *u3,
                  void *v1, void *v2, void *v3, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    vdot3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) dot, (real *) u1,
                                      (real *) u2, (real *) u3,
                                      (real *) v1, (real *) v2,
                                      (real *) v3, *n);
    CUDA_CHECK(cudaGetLastError());
  }
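
  /*
   * Host reference for cuda_vdot3 (illustrative, not part of math.cu;
   * host_vdot3 is a hypothetical name): the pointwise dot product of two
   * 3-component vector fields stored as separate component arrays.
   */
  static void host_vdot3(real *dot, const real *u1, const real *u2,
                         const real *u3, const real *v1, const real *v2,
                         const real *v3, int n) {
    for (int i = 0; i < n; ++i)
      dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }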

  /*
   * Reduction buffer
   */
  int red_s = 0;
  real * bufred = NULL;
  real * bufred_d = NULL;
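
  /*
   * The reduction wrappers below share a grow-only buffer scheme: a
   * pinned host buffer (bufred) plus a device buffer (bufred_d), both
   * reallocated only when a launch needs more partial-sum slots than any
   * previous one (tracked by red_s). The repeated reallocation logic is
   * equivalent to this helper (illustrative sketch, not part of math.cu;
   * ensure_red_buf is a hypothetical name).
   */
  static void ensure_red_buf(int nb) {
    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      /* pinned host memory permits asynchronous device-to-host copies */
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }
  }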

  /** Weighted inner product (local to this rank), sum u_i v_i w_i */
  real cuda_vlsc3(void *u, void *v, void *w, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }

    /* per-block partial sums, then a single-block final reduction */
    glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *) u, (real *) v, (real *) w, bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    return bufred[0];
  }

  /** Weighted inner product, sum a_i b_i c_i (reduced across MPI ranks
      when HAVE_DEVICE_MPI is defined) */
  real cuda_glsc3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }

    glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *) a, (real *) b, (real *) c, bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif

    return bufred[0];
  }
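
  /*
   * Host reference for what cuda_glsc3 computes on a single rank
   * (illustrative, not part of math.cu; host_glsc3 is a hypothetical
   * name): the weighted inner product sum_i a_i b_i c_i, where c is
   * typically a multiplicity or mass-matrix weight in Neko.
   */
  static real host_glsc3(const real *a, const real *b, const real *c,
                         int n) {
    real s = 0.0;
    for (int i = 0; i < n; ++i)
      s += a[i] * b[i] * c[i];
    return s;
  }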

  /** Compute several weighted inner products in a single pass (reduced
      across MPI ranks when HAVE_DEVICE_MPI is defined) */
  void cuda_glsc3_many(real *h, void *w, void *v, void *mult,
                       int *j, int *n) {
    /* round j up to a power of two for the block's x-dimension */
    int pow2 = 1;
    while (pow2 < (*j)) {
      pow2 = 2 * pow2;
    }
    const int nt = 1024 / pow2;
    const dim3 nthrds(pow2, nt, 1);
    const dim3 nblcks(((*n) + nt - 1) / nt, 1, 1);
    const int nb = ((*n) + nt - 1) / nt;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if ((*j) * nb > red_s) {
      red_s = (*j) * nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, (*j) * nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, (*j) * nb * sizeof(real)));
    }

    glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((const real *) w, (const real **) v,
       (const real *) mult, bufred_d, *j, *n);
    CUDA_CHECK(cudaGetLastError());
    glsc3_reduce_kernel<real><<<(*j), 1024, 0, stream>>>(bufred_d, nb, *j);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    device_mpi_allreduce(bufred_d, h, (*j), sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(h, bufred_d, (*j) * sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif
  }
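
  /*
   * Host reference for the presumed semantics of cuda_glsc3_many
   * (illustrative sketch, not part of math.cu; host_glsc3_many is a
   * hypothetical name): j inner products sharing the same w and mult,
   * h[k] = sum_i w_i v[k][i] mult_i, matching the kernel's
   * (w, v, mult, j, n) argument list. As a worked example of the launch
   * shape above, j = 3 rounds up to pow2 = 4, giving 4 x 256 threads
   * per block.
   */
  static void host_glsc3_many(real *h, const real *w, const real **v,
                              const real *mult, int j, int n) {
    for (int k = 0; k < j; ++k) {
      real s = 0.0;
      for (int i = 0; i < n; ++i)
        s += w[i] * v[k][i] * mult[i];
      h[k] = s;
    }
  }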

  /** Inner product, sum a_i b_i (reduced across MPI ranks when
      HAVE_DEVICE_MPI is defined) */
  real cuda_glsc2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }

    glsc2_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *) a,
                                                      (real *) b,
                                                      bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif

    return bufred[0];
  }

  /** Sum of the elements of a vector, sum a_i (reduced across MPI ranks
      when HAVE_DEVICE_MPI is defined) */
  real cuda_glsum(void *a, int *n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }

    glsum_kernel<real><<<nblcks, nthrds, 0, stream>>>((real *) a,
                                                      bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif

    return bufred[0];
  }

}