Neko 0.8.1
A portable framework for high-order spectral element flow simulations
math_kernel.h
#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__
/*
 Copyright (c) 2021-2023, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

/**
 * Device kernel for cmult: a = c * a.
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
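
/*
 * Illustrative launch of the grid-stride kernels in this file, assuming
 * a device array a_d of length n (a_d, nthrds and nblcks are
 * placeholder names):
 *
 *   const int nthrds = 1024;
 *   const int nblcks = (n + nthrds - 1) / nthrds;
 *   cmult_kernel<double><<<nblcks, nthrds>>>(a_d, c, n);
 *
 * Because each thread strides by blockDim.x * gridDim.x, any grid size
 * covers all n elements; the configuration only affects performance.
 */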

/**
 * Device kernel for masked copy: a = b at the indices listed in mask.
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    /* mask[1..m] holds 1-based indices; convert to 0-based */
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}
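
/*
 * Note on the mask layout: the loop reads mask[i+1] and subtracts one,
 * which matches a Fortran-style index array whose entries are 1-based
 * and whose element 0 is not used by this kernel. A hypothetical host
 * setup for m = 3:
 *
 *   int mask_h[4] = {0, 1, 5, 9};   // copies entries 0, 4 and 8
 */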

/**
 * Device kernel for cmult2: a = c * b.
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/**
 * Device kernel for cadd: a = a + c.
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/**
 * Device kernel for cfill: a = c.
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/**
 * Device kernel for add2: a = a + b.
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/**
 * Device kernel for add2s1: a = c1 * a + b.
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/**
 * Device kernel for add2s2_many: x = x + sum_j alpha[j] * p[j].
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j++) {
      tmp += p[j][i] * alpha[j];
    }
    x[i] += tmp;
  }
}
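
/*
 * add2s2_many fuses p_cur scaled vector updates,
 * x = x + sum_j alpha[j] * p[j], into a single pass over memory. A
 * sketch, where x_d, p_d and alpha_d are placeholder device pointers
 * and p_d is a device array of device pointers:
 *
 *   add2s2_many_kernel<double><<<nblcks, nthrds>>>
 *     (x_d, (const double **) p_d, alpha_d, p_cur, n);
 */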

/**
 * Device kernel for add2s2: a = a + c1 * b.
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/**
 * Device kernel for addsqr2s2: a = a + c1 * (b * b).
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/**
 * Device kernel for add3s2: a = c1 * b + c2 * c.
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/**
 * Device kernel for invcol1: a = 1 / a.
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/**
 * Device kernel for invcol2: a = a / b.
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/**
 * Device kernel for col2: a = a * b.
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/**
 * Device kernel for col3: a = b * c.
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/**
 * Device kernel for subcol3: a = a - b * c.
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/**
 * Device kernel for sub2: a = a - b.
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/**
 * Device kernel for sub3: a = b - c.
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/**
 * Device kernel for addcol3: a = a + b * c.
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}

/**
 * Device kernel for addcol4: a = a + b * c * d.
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}

/**
 * Device kernel for vdot3: dot = u . v, the pointwise dot product of
 * two 3-component vector fields.
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}

/**
 * Warp-level sum reduction using shuffle intrinsics.
 */
template< typename T >
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}
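
/*
 * reduce_warp sums val across the 32 lanes of a warp; after the last
 * shuffle, lane 0 holds the warp total. The full mask 0xffffffff
 * assumes every lane is active, so callers should use block sizes that
 * are a multiple of warpSize.
 */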

/**
 * Device kernel for the second stage of a reduction, summing the
 * per-block partial results stored in bufred.
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
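
/*
 * reduce_kernel leaves one partial sum per block in bufred: threads
 * accumulate grid-stride slices, warps reduce via reduce_warp, warp
 * leaders stage results in shared memory, and warp 0 combines them.
 * A second single-block launch yields the final scalar (bufred_d is a
 * placeholder device buffer):
 *
 *   reduce_kernel<double><<<nblcks, nthrds>>>(bufred_d, n);
 *   reduce_kernel<double><<<1, nthrds>>>(bufred_d, nblcks);
 *   // bufred_d[0] now holds the total sum
 */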

/**
 * Reduction kernel for several inner products: block y sums column y
 * of the row-major n-by-j buffer bufred and stores the result in
 * bufred[y].
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j) {

  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  /* Tree reduction in shared memory */
  int i = 512;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i>>1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}
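
/*
 * The fixed 1024-entry shared buffer and the initial offset of 512 in
 * the tree reduction suggest this kernel is meant to be launched with
 * 1024 threads per block, so that every shared-memory slot is
 * initialized before the reduction.
 */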

/**
 * Device kernel for glsc3: per-block partial sums of a * b * c.
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
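
/*
 * A complete weighted inner product sum(a * b * c) can be assembled
 * from glsc3_kernel and reduce_kernel (a sketch; a_d, b_d, c_d and the
 * scratch buffer buf_d, sized for at least nblcks entries, are
 * placeholders):
 *
 *   double res;
 *   glsc3_kernel<double><<<nblcks, nthrds>>>(a_d, b_d, c_d, buf_d, n);
 *   reduce_kernel<double><<<1, nthrds>>>(buf_d, nblcks);
 *   cudaMemcpy(&res, buf_d, sizeof(double), cudaMemcpyDeviceToHost);
 */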

/**
 * Device kernel for glsc3_many: per-block partial sums of the j
 * weighted inner products a * b[k] * c, k < j.
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x + y] = tmp;
  __syncthreads();

  /* Tree reduction over threadIdx.y for each of the j products */
  int i = blockDim.y>>1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x + y] += buf[(threadIdx.y + i)*blockDim.x + y];
    }
    __syncthreads();
    i = i>>1;
  }

  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x + y] = buf[y];
    }
  }
}
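
/*
 * glsc3_many evaluates j inner products in one sweep: threadIdx.x
 * selects the vector b[k] while threadIdx.y and blockIdx.x stride over
 * the n elements. The shared buffer holds blockDim.x * blockDim.y
 * partial sums, so a launch must satisfy blockDim.x >= j and
 * blockDim.x * blockDim.y <= 1024, with blockDim.y a power of two for
 * the tree reduction; each block writes j partial results to buf_h,
 * which can then be collapsed with glsc3_reduce_kernel.
 */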

/**
 * Device kernel for glsc2: per-block partial sums of a * b.
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for glsum: per-block partial sums of a.
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

#endif // __MATH_MATH_KERNEL_H__