Neko 0.8.1
A portable framework for high-order spectral element flow simulations
math_kernel.h
#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__
/*
 Copyright (c) 2021-2023, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

/**
 * Device kernel for cmult
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
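
/*
 * All kernels in this file use the same grid-stride pattern: each thread
 * starts at its global index idx and advances by the total thread count str,
 * so any grid size covers all n elements. A minimal host-side launch sketch
 * (hypothetical device pointer and block/grid sizes, not part of this file):
 *
 *   const int nthrds = 1024;
 *   const int nblcks = (n + nthrds - 1) / nthrds;
 *   cmult_kernel<double><<<nblcks, nthrds>>>(a_d, c, n);
 */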

/**
 * Device kernel for masked copy
 * (the mask holds one-based indices, matching the Fortran-side layout
 *  with the count stored in element 0; hence the mask[i+1]-1 indexing)
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < m; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}

/**
 * Device kernel for cmult2
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/**
 * Device kernel for cadd
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/**
 * Device kernel for cfill
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/**
 * Device kernel for add2
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/**
 * Device kernel for add2s1
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/**
 * Device kernel for add2s2 many
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j++) {
      tmp += p[j][i] * alpha[j];
    }
    x[i] += tmp;
  }
}

/**
 * Device kernel for add2s2
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/**
 * Device kernel for addsqr2s2
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/**
 * Device kernel for add3s2
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/**
 * Device kernel for invcol1
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/**
 * Device kernel for invcol2
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/**
 * Device kernel for col2
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/**
 * Device kernel for col3
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/**
 * Device kernel for subcol3
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/**
 * Device kernel for sub2
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/**
 * Device kernel for sub3
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/**
 * Device kernel for addcol3
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}

/**
 * Device kernel for addcol4
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}

/**
 * Device kernel for vdot3
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}

/**
 * Warp shuffle reduction
 * (the shuffle offsets cover 64 lanes, i.e. a wavefront-64 device)
 */
template< typename T >
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down(val, 32);
  val += __shfl_down(val, 16);
  val += __shfl_down(val, 8);
  val += __shfl_down(val, 4);
  val += __shfl_down(val, 2);
  val += __shfl_down(val, 1);
  return val;
}
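
/*
 * A minimal sketch, assuming a 32-lane CUDA warp instead of the wavefront-64
 * device targeted above: modern CUDA requires the _sync shuffle variants, and
 * the first offset starts at 16. The name reduce_warp_cuda is hypothetical
 * and not part of this file.
 *
 *   template< typename T >
 *   __inline__ __device__ T reduce_warp_cuda(T val) {
 *     val += __shfl_down_sync(0xffffffff, val, 16);
 *     val += __shfl_down_sync(0xffffffff, val, 8);
 *     val += __shfl_down_sync(0xffffffff, val, 4);
 *     val += __shfl_down_sync(0xffffffff, val, 2);
 *     val += __shfl_down_sync(0xffffffff, val, 1);
 *     return val;
 *   }
 */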

/**
 * Reduction kernel; sums the entries of bufred and writes one
 * result per block back into bufred
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[64];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
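
/*
 * Usage note: reduce_kernel is the second stage of a two-pass sum. A first
 * kernel (e.g. glsum_kernel below) leaves one partial sum per block in
 * bufred, and a single-block launch then collapses those partials. A sketch
 * under these assumptions (hypothetical names; nb is the first-stage grid
 * size):
 *
 *   glsum_kernel<double><<<nb, 1024>>>(a_d, bufred_d, n);
 *   reduce_kernel<double><<<1, 1024>>>(bufred_d, nb);
 *   // bufred_d[0] now holds the global sum; copy it back to the host
 */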

/**
 * Reduction kernel for glsc3 many; block y reduces the partial sums
 * of inner product y (note: the shared buffer size and the initial
 * offset of 512 assume a launch with 1024 threads per block)
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j) {
  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  int i = 512;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i >> 1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}

/**
 * Device kernel for glsc3; computes per-block partial sums of the
 * weighted inner product sum(a*b*c)
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
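
/*
 * Usage note: like glsum_kernel, glsc3_kernel only leaves one partial sum
 * per block in buf_h, so a second pass is still required. A sketch, assuming
 * reduce_kernel above and hypothetical device pointers:
 *
 *   glsc3_kernel<double><<<nb, 1024>>>(a_d, b_d, c_d, bufred_d, n);
 *   reduce_kernel<double><<<1, 1024>>>(bufred_d, nb);
 */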

/**
 * Device kernel for glsc3 many
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x + y] = tmp;
  __syncthreads();

  int i = blockDim.y >> 1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x + y] += buf[(threadIdx.y + i)*blockDim.x + y];
    }
    __syncthreads();
    i = i >> 1;
  }
  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x + y] = buf[y];
    }
  }
}
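
/*
 * Usage note (a sketch of the assumed launch, not part of this file):
 * glsc3_many_kernel evaluates j inner products sum(a*b[k]*c) at once and
 * expects a 2D thread block with blockDim.x >= j and
 * blockDim.x * blockDim.y <= 1024, so the shared buffer is large enough;
 * threadIdx.x selects one of the j vectors while threadIdx.y strides over
 * the elements. Each block leaves j partial sums in buf_h, which
 * glsc3_reduce_kernel above then collapses, one block per inner product:
 *
 *   dim3 nthrds(j, 1024 / j);   // hypothetical block shape
 *   glsc3_many_kernel<double><<<nb, nthrds>>>(a_d, b_d, c_d, bufred_d, j, n);
 *   glsc3_reduce_kernel<double><<<j, 1024>>>(bufred_d, nb, j);
 */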

/**
 * Device kernel for glsc2
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for glsum
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

#endif // __MATH_MATH_KERNEL_H__