Neko 0.8.99
A portable framework for high-order spectral element flow simulations
math.cu
/*
 Copyright (c) 2021-2023, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

#include "math_kernel.h"
#include <device/device_config.h>
#include <device/cuda/check.h>
#include <stdio.h>
#include <stdlib.h>

extern "C" {

#include <math/bcknd/device/device_mpi_reduce.h>
#include <math/bcknd/device/device_mpi_op.h>

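  /**
   * Copy a vector \f$ a = b \f$ (device to device)
   */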
  void cuda_copy(void *a, void *b, int *n) {
    CUDA_CHECK(cudaMemcpyAsync(a, b, (*n) * sizeof(real),
                               cudaMemcpyDeviceToDevice,
                               (cudaStream_t) glb_cmd_queue));
  }

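  /**
   * Copy a vector under a mask \f$ a(mask) = b(mask) \f$
   */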
  void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

    masked_copy_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (int *) mask, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

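  /**
   * Fill a constant into the masked elements \f$ a(mask) = c \f$
   */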
  void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);

    cfill_mask_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *size,
                                      mask, *mask_size);
    CUDA_CHECK(cudaGetLastError());
  }

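  /**
   * Zero a real vector \f$ a = 0 \f$
   */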
  void cuda_rzero(void *a, int *n) {
    CUDA_CHECK(cudaMemsetAsync(a, 0, (*n) * sizeof(real),
                               (cudaStream_t) glb_cmd_queue));
  }

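  /**
   * Multiplication by a constant \f$ a = c \cdot a \f$
   */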
  void cuda_cmult(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cmult_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

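  /**
   * Multiplication by a constant \f$ a = c \cdot b \f$
   */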
  void cuda_cmult2(void *a, void *b, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cmult2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

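  /**
   * Add a scalar to a vector \f$ a_i = a_i + c \f$
   */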
  void cuda_cadd(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cadd_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

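  /**
   * Add a scalar to a vector \f$ a_i = b_i + c \f$
   */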
  void cuda_cadd2(void *a, void *b, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cadd2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

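  /**
   * Set all elements to a constant \f$ a = c \f$
   */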
  void cuda_cfill(void *a, real *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    cfill_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

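  /**
   * Vector addition \f$ a = a + b \f$
   */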
  void cuda_add2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

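  /**
   * Vector addition \f$ a = b + c \f$
   */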
  void cuda_add3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

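  /**
   * Vector addition with scalar multiplication \f$ a = c_1 a + b \f$
   * (multiplication on the first argument)
   */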
  void cuda_add2s1(void *a, void *b, real *c1, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2s1_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
    CUDA_CHECK(cudaGetLastError());
  }

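  /**
   * Vector addition with scalar multiplication \f$ a = a + c_1 b \f$
   * (multiplication on the second argument)
   */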
  void cuda_add2s2(void *a, void *b, real *c1, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2s2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
    CUDA_CHECK(cudaGetLastError());
  }

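  /**
   * Vector addition with scalar multiplication over several vectors
   * \f$ x = x + \alpha_1 p_1 + \alpha_2 p_2 + \dots + \alpha_j p_j \f$
   */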
  void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2s2_many_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) x, (const real **) p,
                                      (real *) alpha, *j, *n);
    CUDA_CHECK(cudaGetLastError());
  }

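  /**
   * Vector addition with a squared second argument
   * \f$ a = a + c_1 (b \cdot b) \f$
   */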
  void cuda_addsqr2s2(void *a, void *b, real *c1, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    addsqr2s2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
    CUDA_CHECK(cudaGetLastError());
  }

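  /**
   * Vector addition with scalar multiplication \f$ a = c_1 b + c_2 c \f$
   */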
  void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add3s2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c,
                                      *c1, *c2, *n);
    CUDA_CHECK(cudaGetLastError());
  }

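  /**
   * Invert a vector \f$ a = 1 / a \f$
   */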
  void cuda_invcol1(void *a, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    invcol1_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *n);
    CUDA_CHECK(cudaGetLastError());
  }

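  /**
   * Vector division \f$ a = a / b \f$
   */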
  void cuda_invcol2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    invcol2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

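  /**
   * Hadamard product \f$ a = a \cdot b \f$
   */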
  void cuda_col2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    col2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

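  /**
   * Hadamard product \f$ a = b \cdot c \f$
   */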
  void cuda_col3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    col3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

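  /**
   * Subtract a Hadamard product \f$ a = a - b \cdot c \f$
   */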
  void cuda_subcol3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    subcol3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

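  /**
   * Vector subtraction \f$ a = a - b \f$
   */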
  void cuda_sub2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
    CUDA_CHECK(cudaGetLastError());
  }

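  /**
   * Vector subtraction \f$ a = b - c \f$
   */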
  void cuda_sub3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

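  /**
   * Add a Hadamard product \f$ a = a + b \cdot c \f$
   */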
  void cuda_addcol3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    addcol3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

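  /**
   * Add a triple Hadamard product \f$ a = a + b \cdot c \cdot d \f$
   */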
  void cuda_addcol4(void *a, void *b, void *c, void *d, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    addcol4_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                      (real *) c, (real *) d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

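  /**
   * Compute a pointwise dot product \f$ dot = u \cdot v \f$ of two
   * 3-d vector fields, with components stored in (u1, u2, u3) and
   * (v1, v2, v3)
   */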
  void cuda_vdot3(void *dot, void *u1, void *u2, void *u3,
                  void *v1, void *v2, void *v3, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    vdot3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) dot, (real *) u1,
                                      (real *) u2, (real *) u3,
                                      (real *) v1, (real *) v2,
                                      (real *) v3, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  /*
   * Reduction buffer: pinned host (bufred) and device (bufred_d)
   * scratch space, grown lazily by the global reductions below
   */
  int red_s = 0;
  real * bufred = NULL;
  real * bufred_d = NULL;

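  /**
   * Compute the local product sum \f$ dot = \sum_i u_i v_i w_i \f$
   * (no reduction across MPI ranks)
   */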
  real cuda_vlsc3(void *u, void *v, void *w, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }

    glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *) u, (real *) v, (real *) w, bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    return bufred[0];
  }

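  /**
   * Weighted inner product \f$ a^T b c \f$, reduced across all
   * MPI ranks when device MPI is enabled
   */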
  real cuda_glsc3(void *a, void *b, void *c, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }

    glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real *) a, (real *) b, (real *) c, bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif

    return bufred[0];
  }

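  /**
   * Weighted inner products \f$ h_i = w^T (v_i \cdot mult) \f$ for the
   * vectors \f$ v_1, \dots, v_j \f$, computed in a single pass
   */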
  void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n) {
    int pow2 = 1;
    while (pow2 < (*j)) {
      pow2 = 2 * pow2;
    }
    const int nt = 1024 / pow2;
    const dim3 nthrds(pow2, nt, 1);
    const dim3 nblcks(((*n) + nt - 1) / nt, 1, 1);
    const int nb = ((*n) + nt - 1) / nt;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if ((*j) * nb > red_s) {
      red_s = (*j) * nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, (*j) * nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, (*j) * nb * sizeof(real)));
    }

    glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((const real *) w, (const real **) v,
       (const real *) mult, bufred_d, *j, *n);
    CUDA_CHECK(cudaGetLastError());
    glsc3_reduce_kernel<real><<<(*j), 1024, 0, stream>>>(bufred_d, nb, *j);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    device_mpi_allreduce(bufred_d, h, *j, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(h, bufred_d, (*j) * sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif
  }

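  /**
   * Inner product \f$ a^T b \f$
   */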
  real cuda_glsc2(void *a, void *b, int *n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }

    glsc2_kernel<real>
      <<<nblcks, nthrds, 0, stream>>>((real *) a,
                                      (real *) b,
                                      bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif

    return bufred[0];
  }

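  /**
   * Sum a vector of length n
   */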
  real cuda_glsum(void *a, int *n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > red_s) {
      red_s = nb;
      if (bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(bufred));
        CUDA_CHECK(cudaFree(bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
    }

    glsum_kernel<real>
      <<<nblcks, nthrds, 0, stream>>>((real *) a, bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
    cudaStreamSynchronize(stream);
    device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
    CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
#endif

    return bufred[0];
  }

}