Neko 0.9.99
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
math.cu
Go to the documentation of this file.
1/*
2 Copyright (c) 2021-2023, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
#include "math_kernel.h"
#include <device/device_config.h>
#include <device/cuda/check.h>
#include <stdio.h>
#include <stdlib.h>

#ifdef HAVE_DEVICE_MPI
#include <mpi.h>
#include <comm/device_mpi_reduce.h>
#include <comm/device_mpi_op.h>
#endif
40
41extern "C" {
42
45
/**
 * Fortran wrapper for copy.
 * Copy a vector \f$ a = b \f$ (device-to-device, async on glb_cmd_queue).
 */
void cuda_copy(void *a, void *b, int *n) {
  CUDA_CHECK(cudaMemcpyAsync(a, b, (*n) * sizeof(real),
                             cudaMemcpyDeviceToDevice,
                             (cudaStream_t) glb_cmd_queue));
}
54
/**
 * Fortran wrapper for masked copy:
 * copy the @a m entries of @a b selected by @a mask into @a a.
 */
void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

  masked_copy_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (int *) mask,
                                    *n, *m);
  CUDA_CHECK(cudaGetLastError());
}
68
/**
 * Fortran wrapper for masked reduced copy:
 * gather the masked entries of @a b into the compact vector @a a.
 */
void cuda_masked_red_copy(void *a, void *b, void *mask, int *n, int *m) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

  masked_red_copy_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (int *) mask,
                                    *n, *m);
  CUDA_CHECK(cudaGetLastError());
}
82
/**
 * Fortran wrapper for cfill_mask:
 * fill the masked entries of @a a with the constant @a c.
 */
void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);

  cfill_mask_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real *)a, *c, *size, mask, *mask_size);
  CUDA_CHECK(cudaGetLastError());
}
95
/**
 * Fortran wrapper for rzero: zero a real vector
 * (byte-wise memset is valid for zero; async on glb_cmd_queue).
 */
void cuda_rzero(void *a, int *n) {
  CUDA_CHECK(cudaMemsetAsync(a, 0, (*n) * sizeof(real),
                             (cudaStream_t) glb_cmd_queue));
}
103
/**
 * Fortran wrapper for cmult: multiplication by constant, \f$ a = c \cdot a \f$.
 */
void cuda_cmult(void *a, real *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  cmult_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
117
/**
 * Fortran wrapper for cmult2: \f$ a = c \cdot b \f$.
 */
void cuda_cmult2(void *a, void *b, real *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  cmult2_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
131
/**
 * Fortran wrapper for cadd: add a scalar to a vector, \f$ a_i = a_i + c \f$.
 */
void cuda_cadd(void *a, real *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  cadd_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
145
/**
 * Fortran wrapper for cadd2: \f$ a_i = b_i + c \f$.
 */
void cuda_cadd2(void *a, void *b, real *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  cadd2_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
160
/**
 * Fortran wrapper for cfill: set all entries to a constant, \f$ a_i = c \f$.
 * No-op for n <= 0 (guard avoids a zero-block launch).
 */
void cuda_cfill(void *a, real *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  if (*n > 0) {
    cfill_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }
}
176
/**
 * Fortran wrapper for add2: vector addition, \f$ a = a + b \f$.
 */
void cuda_add2(void *a, void *b, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add2_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
  CUDA_CHECK(cudaGetLastError());
}
191
/**
 * Fortran wrapper for add3: \f$ a = b + c \f$.
 */
void cuda_add3(void *a, void *b, void *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add3_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                    (real *) c, *n);
  CUDA_CHECK(cudaGetLastError());
}
206
/**
 * Fortran wrapper for add4: \f$ a = b + c + d \f$.
 */
void cuda_add4(void *a, void *b, void *c, void *d, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add4_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c,
                                    (real *) d, *n);
  CUDA_CHECK(cudaGetLastError());
}
/**
 * Fortran wrapper for add2s1: \f$ a = c_1 \cdot a + b \f$
 * (multiplication on first argument).
 */
void cuda_add2s1(void *a, void *b, real *c1, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add2s1_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
  CUDA_CHECK(cudaGetLastError());
}
236
/**
 * Fortran wrapper for add2s2: \f$ a = a + c_1 \cdot b \f$
 * (multiplication on second argument).
 */
void cuda_add2s2(void *a, void *b, real *c1, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add2s2_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
  CUDA_CHECK(cudaGetLastError());
}
252
/**
 * Fortran wrapper for add2s2_many:
 * \f$ x = x + \sum_{i=0}^{j-1} \alpha_i \cdot p_i \f$ for a batch of
 * @a j vectors @a p with scalars @a alpha.
 */
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add2s2_many_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) x, (const real **) p,
                                    (real *) alpha, *j, *n);
  CUDA_CHECK(cudaGetLastError());
}
270
/**
 * Fortran wrapper for addsqr2s2: \f$ a = a + c_1 \cdot b^2 \f$.
 */
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  addsqr2s2_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
  CUDA_CHECK(cudaGetLastError());
}
287
/**
 * Fortran wrapper for add3s2: \f$ a = c_1 \cdot b + c_2 \cdot c \f$.
 */
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  add3s2_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c,
                                    *c1, *c2, *n);
  CUDA_CHECK(cudaGetLastError());
}
304
/**
 * Fortran wrapper for invcol1: in-place reciprocal, \f$ a_i = 1 / a_i \f$.
 */
void cuda_invcol1(void *a, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  invcol1_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, *n);
  CUDA_CHECK(cudaGetLastError());
}
318
/**
 * Fortran wrapper for invcol2: \f$ a_i = a_i / b_i \f$.
 */
void cuda_invcol2(void *a, void *b, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  invcol2_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
  CUDA_CHECK(cudaGetLastError());
}
332
/**
 * Fortran wrapper for col2: entry-wise product, \f$ a_i = a_i \cdot b_i \f$.
 */
void cuda_col2(void *a, void *b, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  col2_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
  CUDA_CHECK(cudaGetLastError());
}
346
/**
 * Fortran wrapper for col3: \f$ a_i = b_i \cdot c_i \f$.
 */
void cuda_col3(void *a, void *b, void *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  col3_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
  CUDA_CHECK(cudaGetLastError());
}
360
/**
 * Fortran wrapper for subcol3: \f$ a_i = a_i - b_i \cdot c_i \f$.
 */
void cuda_subcol3(void *a, void *b, void *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  subcol3_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
  CUDA_CHECK(cudaGetLastError());
}
374
375
/**
 * Fortran wrapper for sub2: vector subtraction, \f$ a = a - b \f$.
 */
void cuda_sub2(void *a, void *b, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  sub2_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
  CUDA_CHECK(cudaGetLastError());
}
389
/**
 * Fortran wrapper for sub3: \f$ a = b - c \f$.
 */
void cuda_sub3(void *a, void *b, void *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  sub3_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                    (real *) c, *n);
  CUDA_CHECK(cudaGetLastError());
}
404
/**
 * Fortran wrapper for addcol3: \f$ a_i = a_i + b_i \cdot c_i \f$.
 */
void cuda_addcol3(void *a, void *b, void *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  addcol3_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                    (real *) c, *n);
  CUDA_CHECK(cudaGetLastError());
}
419
/**
 * Fortran wrapper for addcol4: \f$ a_i = a_i + b_i \cdot c_i \cdot d_i \f$.
 */
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  addcol4_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                    (real *) c, (real *) d, *n);
  CUDA_CHECK(cudaGetLastError());
}
434
/**
 * Fortran wrapper for vdot3: entry-wise dot product of two 3-vector fields,
 * \f$ dot_i = u_i \cdot v_i \f$ with components (u1,u2,u3) and (v1,v2,v3).
 */
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3,
                void *v1, void *v2, void *v3, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  vdot3_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) dot, (real *) u1,
                                    (real *) u2, (real *) u3,
                                    (real *) v1, (real *) v2,
                                    (real *) v3, *n);
  CUDA_CHECK(cudaGetLastError());
}
452
/**
 * Fortran wrapper for vcross: entry-wise cross product,
 * \f$ u_i = v_i \times w_i \f$ with components (v1,v2,v3) and (w1,w2,w3).
 */
void cuda_vcross(void *u1, void *u2, void *u3,
                 void *v1, void *v2, void *v3,
                 void *w1, void *w2, void *w3, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  vcross_kernel<real><<<nblcks, nthrds, 0,
    (cudaStream_t) glb_cmd_queue>>>((real *) u1,
                                    (real *) u2, (real *) u3,
                                    (real *) v1, (real *) v2,
                                    (real *) v3,
                                    (real *) w1, (real *) w2,
                                    (real *) w3, *n);
  CUDA_CHECK(cudaGetLastError());
}
473
474
/*
 * Reduction buffer: grown on demand by the reduction wrappers below.
 * red_s tracks the current capacity (in number of partial sums);
 * bufred is pinned host memory, bufred_d its device mirror.
 */
int red_s = 0;
real * bufred = NULL;
real * bufred_d = NULL;
481
/**
 * Fortran wrapper for vlsc3: local (non-MPI) weighted inner product
 * \f$ \sum u_i v_i w_i \f$. Blocks until the result is on the host.
 */
real cuda_vlsc3(void *u, void *v, void *w, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  /* Grow the shared reduction buffer if this launch needs more blocks */
  if (nb > red_s) {
    red_s = nb;
    if (bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(bufred));
      CUDA_CHECK(cudaFree(bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
  }

  glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *) u, (real *) v, (real *) w, bufred_d, *n);
  CUDA_CHECK(cudaGetLastError());
  reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());

  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);

  return bufred[0];
}
515
/**
 * Fortran wrapper for glsc3: global weighted inner product
 * \f$ \sum a_i b_i c_i \f$, reduced across ranks when device MPI is enabled.
 * Blocks until the result is on the host.
 */
real cuda_glsc3(void *a, void *b, void *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  /* Grow the shared reduction buffer if this launch needs more blocks */
  if (nb > red_s) {
    red_s = nb;
    if (bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(bufred));
      CUDA_CHECK(cudaFree(bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
  }

  glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real *) a, (real *) b, (real *) c, bufred_d, *n);
  CUDA_CHECK(cudaGetLastError());
  reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
  cudaStreamSynchronize(stream);
  device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#endif

  return bufred[0];
}
554
/**
 * Fortran wrapper for glsc3_many: @a j simultaneous weighted inner products
 * \f$ h_k = \sum w_i \, v_{k,i} \, mult_i \f$, results returned in @a h.
 * Reduced across ranks when device MPI is enabled; blocks until done.
 */
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n) {
  /* pow2 = smallest power of two >= j; lays j products across threadIdx.x */
  int pow2 = 1;
  while (pow2 < (*j)) {
    pow2 = 2 * pow2;
  }
  const int nt = 1024 / pow2;
  const dim3 nthrds(pow2, nt, 1);
  const dim3 nblcks(((*n) + nt - 1) / nt, 1, 1);
  const int nb = ((*n) + nt - 1) / nt;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  /* Grow the shared reduction buffer: j partial sums per block */
  if ((*j) * nb > red_s) {
    red_s = (*j) * nb;
    if (bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(bufred));
      CUDA_CHECK(cudaFree(bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&bufred, (*j) * nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&bufred_d, (*j) * nb * sizeof(real)));
  }

  glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((const real *) w, (const real **) v,
     (const real *) mult, bufred_d, *j, *n);
  CUDA_CHECK(cudaGetLastError());
  glsc3_reduce_kernel<real><<<(*j), 1024, 0, stream>>>(bufred_d, nb, *j);
  CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
  cudaStreamSynchronize(stream);
  device_mpi_allreduce(bufred_d, h, *j, sizeof(real), DEVICE_MPI_SUM);
#else
  CUDA_CHECK(cudaMemcpyAsync(h, bufred_d, (*j) * sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#endif
}
596
/**
 * Fortran wrapper for glsc2: global inner product \f$ \sum a_i b_i \f$,
 * reduced across ranks when device MPI is enabled.
 * Blocks until the result is on the host.
 */
real cuda_glsc2(void *a, void *b, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  /* Grow the shared reduction buffer if this launch needs more blocks */
  if (nb > red_s) {
    red_s = nb;
    if (bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(bufred));
      CUDA_CHECK(cudaFree(bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
  }

  glsc2_kernel<real>
    <<<nblcks, nthrds, 0, stream>>>((real *) a,
                                    (real *) b,
                                    bufred_d, *n);
  CUDA_CHECK(cudaGetLastError());
  reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());

#ifdef HAVE_DEVICE_MPI
  cudaStreamSynchronize(stream);
  device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#endif

  return bufred[0];
}
637
/**
 * Fortran wrapper for glsum: global sum \f$ \sum a_i \f$, reduced across
 * ranks when device MPI is enabled. Blocks until the result is on the host.
 * Kernel launches are skipped for n <= 0.
 */
real cuda_glsum(void *a, int *n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  /* Grow the shared reduction buffer if this launch needs more blocks */
  if (nb > red_s) {
    red_s = nb;
    if (bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(bufred));
      CUDA_CHECK(cudaFree(bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&bufred_d, nb * sizeof(real)));
  }
  if (*n > 0) {
    glsum_kernel<real>
      <<<nblcks, nthrds, 0, stream>>>((real *) a, bufred_d, *n);
    CUDA_CHECK(cudaGetLastError());
    reduce_kernel<real><<<1, 1024, 0, stream>>>(bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
  }
#ifdef HAVE_DEVICE_MPI
  cudaStreamSynchronize(stream);
  device_mpi_allreduce(bufred_d, bufred, 1, sizeof(real), DEVICE_MPI_SUM);
#else
  CUDA_CHECK(cudaMemcpyAsync(bufred, bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
#endif

  return bufred[0];
}
675
/**
 * Fortran wrapper for absval: in-place absolute value, \f$ a_i = |a_i| \f$.
 */
void cuda_absval(void *a, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  absval_kernel<real>
    <<<nblcks, nthrds, 0, stream>>>((real *) a, *n);
  CUDA_CHECK(cudaGetLastError());
}
691
692 // ======================================================================== //
693 // Point-wise operations.
694
/**
 * Fortran wrapper for pwmax_vec2: point-wise maximum,
 * \f$ a_i = \max(a_i, b_i) \f$.
 */
void cuda_pwmax_vec2(void *a, void *b, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  pwmax_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real *)a, (real *)b, *n);
  CUDA_CHECK(cudaGetLastError());
}
709
/**
 * Fortran wrapper for pwmax_vec3: point-wise maximum,
 * \f$ a_i = \max(b_i, c_i) \f$.
 */
void cuda_pwmax_vec3(void *a, void *b, void *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  pwmax_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real *)a, (real *)b, (real *)c, *n);
  CUDA_CHECK(cudaGetLastError());
}
724
/**
 * Fortran wrapper for pwmax_sca2: point-wise maximum against a scalar,
 * \f$ a_i = \max(a_i, c) \f$.
 */
void cuda_pwmax_sca2(void *a, real *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  pwmax_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real *)a, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
739
/**
 * Fortran wrapper for pwmax_sca3: point-wise maximum against a scalar,
 * \f$ a_i = \max(b_i, c) \f$.
 */
void cuda_pwmax_sca3(void *a, void *b, real *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  pwmax_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real *)a, (real *)b, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
754
/**
 * Fortran wrapper for pwmin_vec2: point-wise minimum,
 * \f$ a_i = \min(a_i, b_i) \f$.
 */
void cuda_pwmin_vec2(void *a, void *b, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  pwmin_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real *)a, (real *)b, *n);
  CUDA_CHECK(cudaGetLastError());
}
769
/**
 * Fortran wrapper for pwmin_vec3: point-wise minimum,
 * \f$ a_i = \min(b_i, c_i) \f$.
 */
void cuda_pwmin_vec3(void *a, void *b, void *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  pwmin_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real *)a, (real *)b, (real *)c, *n);
  CUDA_CHECK(cudaGetLastError());
}
784
/**
 * Fortran wrapper for pwmin_sca2: point-wise minimum against a scalar,
 * \f$ a_i = \min(a_i, c) \f$.
 */
void cuda_pwmin_sca2(void *a, real *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  pwmin_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real *)a, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
799
/**
 * Fortran wrapper for pwmin_sca3: point-wise minimum against a scalar,
 * \f$ a_i = \min(b_i, c) \f$.
 */
void cuda_pwmin_sca3(void *a, void *b, real *c, int *n) {

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  pwmin_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real *)a, (real *)b, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}
814
815} /* extern "C" */
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
#define CUDA_CHECK(err)
Definition check.h:6
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
double real
#define DEVICE_MPI_SUM
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
void cuda_masked_red_copy(void *a, void *b, void *mask, int *n, int *m)
Definition math.cu:72
void cuda_absval(void *a, int *n)
Definition math.cu:680
void cuda_invcol1(void *a, int *n)
Definition math.cu:309
void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n)
Definition math.cu:259
void cuda_cadd2(void *a, void *b, real *c, int *n)
Definition math.cu:150
void cuda_pwmax_sca3(void *a, void *b, real *c, int *n)
Definition math.cu:744
void cuda_pwmin_vec3(void *a, void *b, void *c, int *n)
Definition math.cu:774
real cuda_vlsc3(void *u, void *v, void *w, int *n)
Definition math.cu:486
void cuda_add2s2(void *a, void *b, real *c1, int *n)
Definition math.cu:242
void cuda_masked_copy(void *a, void *b, void *mask, int *n, int *m)
Definition math.cu:58
void cuda_add3(void *a, void *b, void *c, int *n)
Definition math.cu:196
void cuda_col2(void *a, void *b, int *n)
Definition math.cu:337
void cuda_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
Definition math.cu:559
void cuda_add4(void *a, void *b, void *c, void *d, int *n)
Definition math.cu:211
void cuda_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
Definition math.cu:439
void cuda_pwmin_sca3(void *a, void *b, real *c, int *n)
Definition math.cu:804
void cuda_addcol3(void *a, void *b, void *c, int *n)
Definition math.cu:409
void cuda_pwmax_sca2(void *a, real *c, int *n)
Definition math.cu:729
void cuda_subcol3(void *a, void *b, void *c, int *n)
Definition math.cu:365
void cuda_cmult(void *a, real *c, int *n)
Definition math.cu:107
real * bufred
Definition math.cu:479
void cuda_addsqr2s2(void *a, void *b, real *c1, int *n)
Definition math.cu:276
void cuda_add2s1(void *a, void *b, real *c1, int *n)
Definition math.cu:226
real cuda_glsum(void *a, int *n)
Definition math.cu:642
real cuda_glsc2(void *a, void *b, int *n)
Definition math.cu:601
void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
Definition math.cu:293
int red_s
Definition math.cu:478
void cuda_rzero(void *a, int *n)
Definition math.cu:99
void cuda_pwmin_sca2(void *a, real *c, int *n)
Definition math.cu:789
void cuda_pwmax_vec3(void *a, void *b, void *c, int *n)
Definition math.cu:714
void cuda_addcol4(void *a, void *b, void *c, void *d, int *n)
Definition math.cu:424
void cuda_add2(void *a, void *b, int *n)
Definition math.cu:181
void cuda_copy(void *a, void *b, int *n)
Definition math.cu:49
void cuda_pwmax_vec2(void *a, void *b, int *n)
Definition math.cu:699
void cuda_cfill_mask(void *a, real *c, int *size, int *mask, int *mask_size)
Definition math.cu:86
void cuda_invcol2(void *a, void *b, int *n)
Definition math.cu:323
void cuda_pwmin_vec2(void *a, void *b, int *n)
Definition math.cu:759
void cuda_col3(void *a, void *b, void *c, int *n)
Definition math.cu:351
void cuda_cfill(void *a, real *c, int *n)
Definition math.cu:164
void cuda_cadd(void *a, real *c, int *n)
Definition math.cu:135
void cuda_sub2(void *a, void *b, int *n)
Definition math.cu:380
void cuda_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n)
Definition math.cu:457
real cuda_glsc3(void *a, void *b, void *c, int *n)
Definition math.cu:520
real * bufred_d
Definition math.cu:480
void cuda_cmult2(void *a, void *b, real *c, int *n)
Definition math.cu:121
void cuda_sub3(void *a, void *b, void *c, int *n)
Definition math.cu:394