Neko 0.9.99
A portable framework for high-order spectral element flow simulations
math_kernel.h
#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__
/*
 Copyright (c) 2021-2023, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

/**
 * Device kernel for cmult (a = c * a)
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}

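/*
 * Usage sketch (not part of the original header): every pointwise kernel in
 * this file uses the same grid-stride loop, so any launch configuration
 * covers all n elements. A minimal host-side launch, assuming the HIP
 * runtime, Neko's `real` scalar type, and a hypothetical device pointer a_d:
 *
 *   const dim3 nthrds(1024, 1, 1);
 *   const dim3 nblcks((n + 1024 - 1) / 1024, 1, 1);
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(cmult_kernel<real>),
 *                      nblcks, nthrds, 0, stream, a_d, c, n);
 */
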
/**
 * Device kernel for masked copy (a(mask) = b(mask))
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}

/**
 * Device kernel for masked reduced copy (a(i) = b(mask(i)))
 */
template< typename T >
__global__ void masked_red_copy_kernel(T * __restrict__ a,
                                       T * __restrict__ b,
                                       int * __restrict__ mask,
                                       const int n,
                                       const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}

/**
 * Device kernel for masked atomic reduction (a(mask(i)) += b(i))
 */
template< typename T >
__global__ void masked_atomic_reduction_kernel(T * __restrict__ a,
                                               T * __restrict__ b,
                                               int * __restrict__ mask,
                                               const int n,
                                               const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    /* AMD's fast floating-point atomic; the portable equivalent
       would be atomicAdd(&(a[mask[i+1]-1]), b[i]) */
    unsafeAtomicAdd(&(a[mask[i+1]-1]), b[i]);
  }
}

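/*
 * Note on the mask layout in the three masked kernels above: the i+1 offset
 * combined with the -1 shift suggests a Fortran-side mask in which mask(0)
 * holds the entry count and mask(1:m) hold 1-based indices. For example,
 * with m = 3 and mask = {3, 5, 7, 9}, masked_red_copy_kernel gathers
 * a[0] = b[4], a[1] = b[6], a[2] = b[8].
 */
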
/**
 * Device kernel for cfill_mask (a(mask) = c)
 */
template< typename T >
__global__ void cfill_mask_kernel(T* __restrict__ a,
                                  const T c,
                                  const int size,
                                  int* __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < mask_size; i += str) { a[mask[i]-1] = c; }
}

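/*
 * Unlike the masked kernels above, cfill_mask_kernel reads mask[i] rather
 * than mask[i+1]: it expects a plain array of mask_size 1-based indices
 * with no leading size entry.
 */
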
/**
 * Device kernel for cmult2 (a = c * b)
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/**
 * Device kernel for cadd (a = a + c)
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/**
 * Device kernel for cadd2 (a = b + c)
 */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}

/**
 * Device kernel for cfill (a = c)
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/**
 * Device kernel for add2 (a = a + b)
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/**
 * Device kernel for add3 (a = b + c)
 */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}

/**
 * Device kernel for add4 (a = b + c + d)
 */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}

/**
 * Device kernel for add2s1 (a = c1 * a + b)
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/**
 * Device kernel for add2s2 applied to many vectors
 * (x = x + sum_j alpha(j) * p(j))
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i] * alpha[j];
    }
    x[i] += tmp;
  }
}

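/*
 * add2s2_many_kernel accumulates several scaled vectors in one pass:
 * x(i) += sum_{j < p_cur} alpha(j) * p(j)(i). Since p is an array of device
 * pointers, the pointer table itself must also live in device memory. A
 * setup sketch with hypothetical names, assuming the HIP runtime:
 *
 *   real *p_h[16];                        // pointer table built on the host
 *   // ... p_h[j] = device allocation holding vector j ...
 *   real **p_d;
 *   hipMalloc((void **) &p_d, 16 * sizeof(real *));
 *   hipMemcpy(p_d, p_h, 16 * sizeof(real *), hipMemcpyHostToDevice);
 */
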
/**
 * Device kernel for add2s2 (a = a + c1 * b)
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/**
 * Device kernel for addsqr2s2 (a = a + c1 * b^2)
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/**
 * Device kernel for add3s2 (a = c1 * b + c2 * c)
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/**
 * Device kernel for invcol1 (a = 1 / a)
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  /* a T-typed constant keeps the division in the working precision,
     instead of promoting to double via the literal 1.0 */
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/**
 * Device kernel for invcol2 (a = a / b)
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/**
 * Device kernel for col2 (a = a * b)
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/**
 * Device kernel for col3 (a = b * c)
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/**
 * Device kernel for subcol3 (a = a - b * c)
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/**
 * Device kernel for sub2 (a = a - b)
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/**
 * Device kernel for sub3 (a = b - c)
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/**
 * Device kernel for addcol3 (a = a + b * c)
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}

/**
 * Device kernel for addcol4 (a = a + b * c * d)
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}

/**
 * Device kernel for vdot3 (pointwise dot product of two 3-vectors,
 * dot = u . v)
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}

/**
 * Device kernel for vcross (pointwise cross product, u = v x w)
 */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }
}

/**
 * Warp-level (wavefront) shuffle sum reduction
 */
template< typename T >
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down(val, 32);
  val += __shfl_down(val, 16);
  val += __shfl_down(val, 8);
  val += __shfl_down(val, 4);
  val += __shfl_down(val, 2);
  val += __shfl_down(val, 1);
  return val;
}

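/*
 * reduce_warp sums a value across one wavefront using shuffles. The first
 * offset of 32 implies 64-wide wavefronts, i.e. AMD GPUs via HIP; a CUDA
 * port with 32-wide warps would start at offset 16 (and use
 * __shfl_down_sync).
 */
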
/**
 * Device kernel for a global sum reduction of bufred
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[64];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

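/*
 * reduce_kernel is the second pass of a two-pass reduction: a first kernel
 * (e.g. glsum_kernel below) leaves one partial sum per block in bufred, and
 * a single-block launch of reduce_kernel then collapses those partials into
 * bufred[0]. Sketch with hypothetical host code, assuming the HIP runtime:
 *
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(glsum_kernel<real>),
 *                      nblcks, nthrds, 0, stream, a_d, bufred_d, n);
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
 *                      1, 1024, 0, stream, bufred_d, nblcks.x);
 *   hipMemcpyAsync(&res, bufred_d, sizeof(real),
 *                  hipMemcpyDeviceToHost, stream);
 */
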
/**
 * Device kernel for the final reduction of a multi-product buffer
 * (one block per product y < j, as produced by glsc3_many_kernel)
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j) {

  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  int i = 512;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i >> 1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}

/**
 * Device kernel for glsc3 (weighted inner product, sum(a * b * c))
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for glsc3_many (j weighted inner products in one pass)
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x + y] = tmp;
  __syncthreads();

  int i = blockDim.y >> 1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x + y] += buf[(threadIdx.y + i)*blockDim.x + y];
    }
    __syncthreads();
    i = i >> 1;
  }

  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x + y] = buf[y];
    }
  }
}

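/*
 * glsc3_many_kernel computes j weighted inner products (a, b_k, c) in a
 * single sweep using a 2-D thread block: threadIdx.x picks the product
 * k < j and threadIdx.y strides over the vector entries. Each block leaves
 * j partials at buf_h[j*blockIdx.x + k], which glsc3_reduce_kernel above
 * then collapses per product, one product per block. Note that
 * blockDim.x * blockDim.y must not exceed 1024, the extent of buf.
 */
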
/**
 * Device kernel for glsc2 (inner product, sum(a * b))
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for glsum (sum(a))
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for absval (a = |a|)
 */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}

#endif // __MATH_MATH_KERNEL_H__