Neko 0.9.99
A portable framework for high-order spectral element flow simulations
math_kernel.h
Go to the documentation of this file.
#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__
/*
 Copyright (c) 2021-2023, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

/**
 * Device kernel for cmult
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}

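/*
 * Note on the mask layout used by the two kernels below: they loop over m
 * entries but address the data through mask[i+1]-1. The masks are built on
 * the Fortran side, so the stored indices are 1-based (hence the -1), and
 * the first element of the mask array is presumably a header entry holding
 * its length (hence the +1 that skips it).
 */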
/**
 * Device kernel for masked copy
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}

/**
 * Device kernel for masked reduced copy
 */
template< typename T >
__global__ void masked_red_copy_kernel(T * __restrict__ a,
                                       T * __restrict__ b,
                                       int * __restrict__ mask,
                                       const int n,
                                       const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[i] = b[mask[i+1]-1];
  }
}

/**
 * Device kernel for cfill_mask
 */
template< typename T >
__global__ void cfill_mask_kernel(T * __restrict__ a,
                                  const T c,
                                  const int size,
                                  int * __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < mask_size; i += str) { a[mask[i]-1] = c; }
}

/**
 * Device kernel for cmult2
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/**
 * Device kernel for cadd
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/**
 * Device kernel for cadd2
 */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}

/**
 * Device kernel for cfill
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/**
 * Device kernel for add2
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/**
 * Device kernel for add3
 */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}

/**
 * Device kernel for add4
 */
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const T * __restrict__ d,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i] + d[i];
  }
}

/**
 * Device kernel for add2s1
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/**
 * Device kernel for add2s2 many
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;


  for (int i = idx; i < n; i+= str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j ++) {
      tmp += p[j][i]*alpha[j];
    }
    x[i] += tmp;
  }
}

/**
 * Device kernel for add2s2
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/**
 * Device kernel for addsqr2s2
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/**
 * Device kernel for add3s2
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/**
 * Device kernel for invcol1
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/**
 * Device kernel for invcol2
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/**
 * Device kernel for col2
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/**
 * Device kernel for col3
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/**
 * Device kernel for subcol3
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/**
 * Device kernel for sub2
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/**
 * Device kernel for sub3
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/**
 * Device kernel for addcol3
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }

}

/**
 * Device kernel for addcol4
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }

}

/**
 * Device kernel for vdot3
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }

}

/**
 * Device kernel for vcross
 */
template< typename T >
__global__ void vcross_kernel(T * __restrict__ u1,
                              T * __restrict__ u2,
                              T * __restrict__ u3,
                              const T * __restrict__ v1,
                              const T * __restrict__ v2,
                              const T * __restrict__ v3,
                              const T * __restrict__ w1,
                              const T * __restrict__ w2,
                              const T * __restrict__ w3,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    u1[i] = v2[i]*w3[i] - v3[i]*w2[i];
    u2[i] = v3[i]*w1[i] - v1[i]*w3[i];
    u3[i] = v1[i]*w2[i] - v2[i]*w1[i];
  }
}

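/*
 * The warp-level reduction below combines values with __shfl_down, starting
 * at an offset of 32 and halving down to 1. Covering offsets 32..1 reduces a
 * 64-lane wavefront, which suggests this header is the HIP/AMD backend; a
 * 32-lane CUDA warp would only need offsets 16..1 (and, on recent CUDA, the
 * __shfl_down_sync variant).
 */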
/**
 * Warp shuffle reduction
 */
template< typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down(val, 32);
  val += __shfl_down(val, 16);
  val += __shfl_down(val, 8);
  val += __shfl_down(val, 4);
  val += __shfl_down(val, 2);
  val += __shfl_down(val, 1);
  return val;
}

/**
 * Reduction kernel
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str)
  {
    sum += bufred[i];
  }

  __shared__ T shared[64];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

/**
 * Reduction kernel for glsc3
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j
                                    ) {
  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step)
  {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  int i = 512;
  while (i != 0)
  {
    if (threadIdx.x < i && (threadIdx.x + i) < n)
    {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i>>1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}

/**
 * Device kernel for glsc3
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

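/*
 * The kernel below evaluates j weighted inner products sum_i a[i]*b[k][i]*c[i],
 * k = 0..j-1, in a single pass. The block is two-dimensional: threadIdx.x
 * selects which of the j vectors a thread accumulates, while threadIdx.y and
 * blockIdx.x stride over the n entries. Per-block partial sums are combined
 * with a shared-memory tree reduction over threadIdx.y, and each block writes
 * j partial results to buf_h, which glsc3_reduce_kernel above then reduces
 * across blocks.
 */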
/**
 * Device kernel for glsc3 many
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i+= str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x+y] = tmp;
  __syncthreads();

  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x+y] += buf[(threadIdx.y + i)*blockDim.x+y];
    }
    __syncthreads();
    i = i>>1;
  }
  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x+y] = buf[y];
    }
  }
}


/**
 * Device kernel for glsc2
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i+= str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for glsum
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str)
  {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

/**
 * Device kernel for absval
 */
template< typename T >
__global__ void absval_kernel(T * __restrict__ a,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = fabs(a[i]);
  }
}

#endif // __MATH_MATH_KERNEL_H__
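In Neko these kernels are launched through the Fortran device backends, but a minimal standalone driver illustrates the launch pattern they all share (a grid-stride loop over n elements). The sketch below is illustrative only and assumes a HIP toolchain; the file name test_cmult.hip and all variable names in it are made up for the example and are not part of Neko.

// test_cmult.hip -- illustrative driver, not part of Neko.
// Scales a device vector with cmult_kernel and checks one entry on the host.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>
#include "math_kernel.h"

int main() {
  const int n = 1 << 20;
  std::vector<double> a(n, 2.0);

  double *a_d = nullptr;
  hipMalloc((void **) &a_d, n * sizeof(double));
  hipMemcpy(a_d, a.data(), n * sizeof(double), hipMemcpyHostToDevice);

  // One thread per element; the grid-stride loop inside the kernel makes the
  // exact grid size uncritical.
  const int nthreads = 1024;
  const int nblocks = (n + nthreads - 1) / nthreads;
  cmult_kernel<double><<<nblocks, nthreads>>>(a_d, 3.0, n);

  // The blocking copy back to the host also synchronizes with the kernel.
  hipMemcpy(a.data(), a_d, n * sizeof(double), hipMemcpyDeviceToHost);
  hipFree(a_d);

  printf("a[0] = %f (expected 6.0)\n", a[0]);
  return 0;
}

A CUDA build would look the same apart from the runtime API prefix; with hipcc one would compile it along the lines of "hipcc test_cmult.hip -o test_cmult", assuming math_kernel.h is on the include path.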