Neko 0.8.99
A portable framework for high-order spectral element flow simulations
math_kernel.h
#ifndef __MATH_MATH_KERNEL_H__
#define __MATH_MATH_KERNEL_H__
/*
 Copyright (c) 2021-2023, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

/**
 * Device kernel for cmult (a = c * a)
 */
template< typename T >
__global__ void cmult_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  /* Grid-stride loop: each thread handles every str-th element */
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * a[i];
  }
}
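
/*
 * Example (not part of Neko): how a kernel of this shape might be
 * launched from the host. The 1024-thread block is an assumption for
 * illustration; the grid-stride loop keeps the kernel correct for any
 * grid size. On a HIP build the triple-chevron launch works the same.
 */
static inline void cmult_launch_example(double * a_d, double c, int n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks((n + 1024 - 1) / 1024, 1, 1);
  cmult_kernel<double><<<nblcks, nthrds>>>(a_d, c, n);
}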

/**
 * Device kernel for masked copy (a(mask) = b(mask))
 */
template< typename T >
__global__ void masked_copy_kernel(T * __restrict__ a,
                                   T * __restrict__ b,
                                   int * __restrict__ mask,
                                   const int n,
                                   const int m) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  /* mask holds one-based (Fortran) indices starting at mask[1];
     the -1 converts them to zero-based C indexing */
  for (int i = idx; i < m; i += str) {
    a[mask[i+1]-1] = b[mask[i+1]-1];
  }
}
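
/*
 * Example (not part of Neko): the indexing above implies a mask layout
 * inherited from the Fortran side, where entry 0 is a bookkeeping slot
 * (by all appearances the mask length) and entries 1..m hold one-based
 * indices. A host-side sketch of such a mask:
 */
static inline void masked_copy_mask_example(void) {
  int mask_h[4];
  mask_h[0] = 3;  /* number of masked entries (bookkeeping slot)   */
  mask_h[1] = 1;  /* one-based index, selects a[0] and b[0]        */
  mask_h[2] = 4;  /* selects a[3] and b[3]                         */
  mask_h[3] = 7;  /* selects a[6] and b[6]                         */
  (void) mask_h;  /* copy to device, then launch with m = 3        */
}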

/**
 * Device kernel for cfill_mask (a(mask) = c)
 */
template< typename T >
__global__ void cfill_mask_kernel(T * __restrict__ a,
                                  const T c,
                                  const int size,
                                  int * __restrict__ mask,
                                  const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  /* Unlike masked_copy_kernel, the one-based indices start at mask[0] */
  for (int i = idx; i < mask_size; i += str) {
    a[mask[i]-1] = c;
  }
}

/**
 * Device kernel for cmult2 (a = c * b)
 */
template< typename T >
__global__ void cmult2_kernel(T * __restrict__ a,
                              T * __restrict__ b,
                              const T c,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c * b[i];
  }
}

/**
 * Device kernel for cadd (a = a + c)
 */
template< typename T >
__global__ void cadd_kernel(T * __restrict__ a,
                            const T c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c;
  }
}

/**
 * Device kernel for cadd2 (a = b + c, with scalar c)
 */
template< typename T >
__global__ void cadd2_kernel(T * __restrict__ a,
                             T * __restrict__ b,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c;
  }
}

/**
 * Device kernel for cfill (a = c)
 */
template< typename T >
__global__ void cfill_kernel(T * __restrict__ a,
                             const T c,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c;
  }
}

/**
 * Device kernel for add2 (a = a + b)
 */
template< typename T >
__global__ void add2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i];
  }
}

/**
 * Device kernel for add3 (a = b + c)
 */
template< typename T >
__global__ void add3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] + c[i];
  }
}

/**
 * Device kernel for add2s1 (a = c1 * a + b)
 */
template< typename T >
__global__ void add2s1_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * a[i] + b[i];
  }
}

/**
 * Device kernel for add2s2_many (x = x + sum_j alpha_j * p_j)
 */
template< typename T >
__global__ void add2s2_many_kernel(T * __restrict__ x,
                                   const T ** p,
                                   const T * alpha,
                                   const int p_cur,
                                   const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    T tmp = 0.0;
    for (int j = 0; j < p_cur; j++) {
      tmp += p[j][i] * alpha[j];
    }
    x[i] += tmp;
  }
}
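
/*
 * Example (not part of Neko): since p is dereferenced on the device, the
 * array of pointers itself must reside in device memory, not only the
 * vectors it points to. A setup sketch; names are illustrative, error
 * checking is omitted, and on a HIP build the calls would be
 * hipMalloc/hipMemcpy.
 */
static inline const double ** setup_p_example(double ** vec_d, int p_cur) {
  const double **p_d;  /* device-side array of device pointers */
  cudaMalloc((void **) &p_d, p_cur * sizeof(double *));
  cudaMemcpy((void *) p_d, vec_d, p_cur * sizeof(double *),
             cudaMemcpyHostToDevice);
  return p_d;
}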

/**
 * Device kernel for add2s2 (a = a + c1 * b)
 */
template< typename T >
__global__ void add2s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T c1,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * b[i];
  }
}

/**
 * Device kernel for addsqr2s2 (a = a + c1 * b^2)
 */
template< typename T >
__global__ void addsqr2s2_kernel(T * __restrict__ a,
                                 const T * __restrict__ b,
                                 const T c1,
                                 const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + c1 * (b[i] * b[i]);
  }
}

/**
 * Device kernel for add3s2 (a = c1 * b + c2 * c)
 */
template< typename T >
__global__ void add3s2_kernel(T * __restrict__ a,
                              const T * __restrict__ b,
                              const T * __restrict__ c,
                              const T c1,
                              const T c2,
                              const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = c1 * b[i] + c2 * c[i];
  }
}

/**
 * Device kernel for invcol1 (a = 1 / a)
 */
template< typename T >
__global__ void invcol1_kernel(T * __restrict__ a,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  const T one = 1.0;

  for (int i = idx; i < n; i += str) {
    a[i] = one / a[i];
  }
}

/**
 * Device kernel for invcol2 (a = a / b)
 */
template< typename T >
__global__ void invcol2_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] / b[i];
  }
}

/**
 * Device kernel for col2 (a = a * b)
 */
template< typename T >
__global__ void col2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] * b[i];
  }
}

/**
 * Device kernel for col3 (a = b * c)
 */
template< typename T >
__global__ void col3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] * c[i];
  }
}

/**
 * Device kernel for subcol3 (a = a - b * c)
 */
template< typename T >
__global__ void subcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i] * c[i];
  }
}

/**
 * Device kernel for sub2 (a = a - b)
 */
template< typename T >
__global__ void sub2_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] - b[i];
  }
}

/**
 * Device kernel for sub3 (a = b - c)
 */
template< typename T >
__global__ void sub3_kernel(T * __restrict__ a,
                            const T * __restrict__ b,
                            const T * __restrict__ c,
                            const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = b[i] - c[i];
  }
}

/**
 * Device kernel for addcol3 (a = a + b * c)
 */
template< typename T >
__global__ void addcol3_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i];
  }
}

/**
 * Device kernel for addcol4 (a = a + b * c * d)
 */
template< typename T >
__global__ void addcol4_kernel(T * __restrict__ a,
                               const T * __restrict__ b,
                               const T * __restrict__ c,
                               const T * __restrict__ d,
                               const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    a[i] = a[i] + b[i] * c[i] * d[i];
  }
}

/**
 * Device kernel for vdot3 (dot_i = u_i . v_i for 3-component fields)
 */
template< typename T >
__global__ void vdot3_kernel(T * __restrict__ dot,
                             const T * __restrict__ u1,
                             const T * __restrict__ u2,
                             const T * __restrict__ u3,
                             const T * __restrict__ v1,
                             const T * __restrict__ v2,
                             const T * __restrict__ v3,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  for (int i = idx; i < n; i += str) {
    dot[i] = u1[i] * v1[i] + u2[i] * v2[i] + u3[i] * v3[i];
  }
}

/**
 * Warp shuffle reduction: sums val across all lanes of a warp, with
 * the result landing in lane 0. The cascade starts at offset 32,
 * i.e. it assumes a 64-lane wavefront (AMD/HIP).
 */
template< typename T >
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down(val, 32);
  val += __shfl_down(val, 16);
  val += __shfl_down(val, 8);
  val += __shfl_down(val, 4);
  val += __shfl_down(val, 2);
  val += __shfl_down(val, 1);
  return val;
}
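
/*
 * Example (not part of Neko): reduce_warp in isolation. One wavefront
 * sums 64 values and lane 0 ends up with the total; the 64-thread
 * launch matches the 64-lane assumption noted above.
 */
template< typename T >
__global__ void warp_sum_example(const T * in, T * out) {
  T v = reduce_warp<T>(in[threadIdx.x]);  /* one element per lane */
  if (threadIdx.x == 0)
    *out = v;
}
/* launch as: warp_sum_example<double><<<1, 64>>>(in_d, out_d); */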

/**
 * Reduction kernel for buffer bufred: collapses n partial sums into
 * one value per block, written back to bufred[blockIdx.x].
 */
template< typename T >
__global__ void reduce_kernel(T * bufred, const int n) {

  /* Grid-stride partial sum per thread */
  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[64];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  /* Reduce within each warp, stage one value per warp in shared memory */
  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  /* The first warp reduces the per-warp values */
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

/**
 * Reduction kernel for glsc3_many: bufred holds j partial sums per
 * row (row-major n-by-j); block y reduces column y into bufred[y].
 */
template< typename T >
__global__ void glsc3_reduce_kernel(T * bufred,
                                    const int n,
                                    const int j) {

  __shared__ T buf[1024];
  const int idx = threadIdx.x;
  const int y = blockIdx.x;
  const int step = blockDim.x;

  buf[idx] = 0;
  for (int i = idx; i < n; i += step) {
    buf[idx] += bufred[i*j + y];
  }
  __syncthreads();

  /* Tree reduction in shared memory */
  int i = 512;
  while (i != 0) {
    if (threadIdx.x < i && (threadIdx.x + i) < n) {
      buf[threadIdx.x] += buf[threadIdx.x + i];
    }
    i = i >> 1;
    __syncthreads();
  }

  bufred[y] = buf[0];
}

/**
 * Device kernel for glsc3: per-block partial sums of the weighted
 * inner product sum(a * b * c), written to buf_h[blockIdx.x].
 */
template< typename T >
__global__ void glsc3_kernel(const T * a,
                             const T * b,
                             const T * c,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i] * c[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
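
/*
 * Example (not part of Neko): the per-block partial sums still need a
 * second pass. A sketch of the two-stage device-local dot product;
 * launch sizes are illustrative, bufred_d must hold at least one entry
 * per block, and any cross-rank (MPI) combination happens elsewhere.
 */
static inline double glsc3_example(const double * a_d, const double * b_d,
                                   const double * c_d, double * bufred_d,
                                   int n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks((n + 1024 - 1) / 1024, 1, 1);

  glsc3_kernel<double><<<nblcks, nthrds>>>(a_d, b_d, c_d, bufred_d, n);
  reduce_kernel<double><<<1, nthrds>>>(bufred_d, nblcks.x);

  double res;
  cudaMemcpy(&res, bufred_d, sizeof(double), cudaMemcpyDeviceToHost);
  return res;
}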

/**
 * Device kernel for glsc3_many: computes j weighted inner products
 * sum(a * b_y * c) in one pass; j partial sums per block go to buf_h.
 */
template< typename T >
__global__ void glsc3_many_kernel(const T * a,
                                  const T ** b,
                                  const T * c,
                                  T * buf_h,
                                  const int j,
                                  const int n) {

  /* The x-dimension indexes the j products; the y-dimension strides
     through the data */
  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int str = blockDim.y * gridDim.x;
  const int y = threadIdx.x;

  __shared__ T buf[1024];
  T tmp = 0;
  if (y < j) {
    for (int i = idx; i < n; i += str) {
      tmp += a[i] * b[threadIdx.x][i] * c[i];
    }
  }

  buf[threadIdx.y*blockDim.x + y] = tmp;
  __syncthreads();

  /* Tree reduction over the y-dimension of the block */
  int i = blockDim.y >> 1;
  while (i != 0) {
    if (threadIdx.y < i) {
      buf[threadIdx.y*blockDim.x + y] += buf[(threadIdx.y + i)*blockDim.x + y];
    }
    __syncthreads();
    i = i >> 1;
  }

  if (threadIdx.y == 0) {
    if (y < j) {
      buf_h[j*blockIdx.x + y] = buf[y];
    }
  }
}
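
/*
 * Example (not part of Neko): a launch geometry compatible with
 * glsc3_many_kernel. threadIdx.x selects one of the j products while the
 * y-dimension takes the striding role, so blockDim.x * blockDim.y must
 * not exceed 1024, the size of the shared buffer. glsc3_reduce_kernel
 * then collapses the j columns written by the first pass. Sizes are
 * illustrative.
 */
static inline void glsc3_many_example(const double * a_d, const double ** b_d,
                                      const double * c_d, double * bufred_d,
                                      int n) {
  const int j = 16;                        /* illustrative          */
  const dim3 nthrds(j, 1024 / j, 1);       /* x: products, y: data  */
  const dim3 nblcks((n + nthrds.y - 1) / nthrds.y, 1, 1);

  glsc3_many_kernel<double><<<nblcks, nthrds>>>(a_d, b_d, c_d,
                                                bufred_d, j, n);
  glsc3_reduce_kernel<double><<<j, 1024>>>(bufred_d, nblcks.x, j);
}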

/**
 * Device kernel for glsc2: per-block partial sums of the inner
 * product sum(a * b), written to buf_h[blockIdx.x].
 */
template< typename T >
__global__ void glsc2_kernel(const T * a,
                             const T * b,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/**
 * Device kernel for glsum: per-block partial sums of sum(a),
 * written to buf_h[blockIdx.x].
 */
template< typename T >
__global__ void glsum_kernel(const T * a,
                             T * buf_h,
                             const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[64];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
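
/*
 * Example (not part of Neko): glsum_kernel, glsc2_kernel and glsc3_kernel
 * all share the two-stage pattern sketched after glsc3_kernel; only the
 * producer kernel changes. For a plain sum, under the same assumptions:
 *
 *   glsum_kernel<double><<<nblcks, nthrds>>>(a_d, bufred_d, n);
 *   reduce_kernel<double><<<1, nthrds>>>(bufred_d, nblcks.x);
 */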

#endif // __MATH_MATH_KERNEL_H__