Loading [MathJax]/extensions/tex2jax.js
Neko
0.9.99
A portable framework for high-order spectral element flow simulations
Toggle main menu visibility
Main Page
User Guide
Developer Guide
Modules
Modules List
Module Members
All
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
z
Functions/Subroutines
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
z
Variables
a
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
x
z
Enumerator
c
h
Data Types
Data Types List
Data Type Index
Class Hierarchy
Data Fields
All
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
Functions/Subroutines
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
Variables
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
Files
File List
File Members
All
_
a
b
c
d
e
f
g
h
i
j
l
m
n
o
p
r
s
t
u
v
w
x
z
Functions/Subroutines
_
a
c
d
e
f
g
h
i
j
l
m
n
o
p
r
s
t
v
z
Variables
a
b
c
d
e
f
g
h
i
j
l
m
n
o
p
r
s
t
u
v
w
x
z
Typedefs
Macros
_
c
d
g
h
m
n
p
s
Examples
•
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerator
Macros
Pages
Loading...
Searching...
No Matches
gs_nvshmem_kernels.h
Go to the documentation of this file.
1
/*
2
Copyright (c) 2024-2025, The Neko Authors
3
All rights reserved.
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions
7
are met:
8
9
* Redistributions of source code must retain the above copyright
10
notice, this list of conditions and the following disclaimer.
11
12
* Redistributions in binary form must reproduce the above
13
copyright notice, this list of conditions and the following
14
disclaimer in the documentation and/or other materials provided
15
with the distribution.
16
17
* Neither the name of the authors nor the names of its
18
contributors may be used to endorse or promote products derived
19
from this software without specific prior written permission.
20
21
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25
COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32
POSSIBILITY OF SUCH DAMAGE.
33
*/
34
35
#ifndef __GS_NVSHMEM_KERNELS__
36
#define __GS_NVHSMEM_KERNELS__
37
38
#include <nvshmemx.h>
39
40
41
template
<
typename
T >
42
__global__
void
pack_pushShmemKernel
(
const
T
*
__restrict__
u
,
43
T
*
dest
,
44
T
*
__restrict__
src
,
45
const
int
*
__restrict__
dof,
46
const
int
destRank
,
47
const
int
srcRank
,
48
const
int
n,
49
uint64_t
counter,
50
uint64_t
*
notifyDone
,
51
uint64_t
*
notifyReady
);
52
53
template
<>
54
__global__
void
pack_pushShmemKernel
(
const
float
*
__restrict__
u
,
55
float
*
dest
,
56
float
*
__restrict__
src
,
57
const
int
*
__restrict__
dof,
58
const
int
destRank
,
59
const
int
srcRank
,
60
const
int
n,
61
uint64_t
counter,
62
uint64_t
*
notifyDone
,
63
uint64_t
*
notifyReady
)
64
{
65
66
67
const
int
j
=
threadIdx
.x +
blockDim
.x *
blockIdx
.x;
68
69
if
(
j
< n) {
70
src
[
j
] =
u
[dof[
j
]-1];
71
}
72
__syncthreads
();
73
74
//TO DO: 1 block transfers seem best from initial investigations, check this more thoroughly
75
size_t
numBlocksForTransfer
= 1;
76
if
(
blockIdx
.x <
numBlocksForTransfer
)
77
{
78
size_t
n_per_block
= n/
numBlocksForTransfer
;
79
size_t
block_offset
=
n_per_block
*
blockIdx
.x;
80
size_t
dataSize
=
blockIdx
.x != (
numBlocksForTransfer
- 1) ?
81
n_per_block
:
max
(n -
block_offset
,
n_per_block
);
82
83
// Notify ready to sending rank, and wait until recieving rank is ready
84
if
(
threadIdx
.x == 0) {
85
nvshmemx_signal_op
(
notifyReady
, counter,
NVSHMEM_SIGNAL_SET
,
srcRank
);
86
nvshmem_signal_wait_until
(
notifyReady
,
NVSHMEM_CMP_EQ
, counter);
87
}
88
__syncthreads
();
89
90
// Push data
91
nvshmemx_float_put_signal_nbi_block
(
dest
+
block_offset
,
src
+
92
block_offset
,
dataSize
,
93
notifyDone
, counter,
94
NVSHMEM_SIGNAL_SET
,
destRank
);
95
}
96
}
54
__global__
void
pack_pushShmemKernel
(
const
float
*
__restrict__
u
, {
…
}
97
98
template
<>
99
__global__
void
pack_pushShmemKernel
(
const
double
*
__restrict__
u
,
100
double
*
dest
,
101
double
*
__restrict__
src
,
102
const
int
*
__restrict__
dof,
103
const
int
destRank
,
104
const
int
srcRank
,
105
const
int
n,
106
uint64_t
counter,
107
uint64_t
*
notifyDone
,
108
uint64_t
*
notifyReady
)
109
{
110
111
112
const
int
j
=
threadIdx
.x +
blockDim
.x *
blockIdx
.x;
113
114
if
(
j
< n) {
115
src
[
j
] =
u
[dof[
j
]-1];
116
}
117
__syncthreads
();
118
119
//TO DO: 1 block transfers seem best from initial investigations, check this more thoroughly
120
size_t
numBlocksForTransfer
= 1;
121
if
(
blockIdx
.x <
numBlocksForTransfer
)
122
{
123
size_t
n_per_block
= n/
numBlocksForTransfer
;
124
size_t
block_offset
=
n_per_block
*
blockIdx
.x;
125
size_t
dataSize
=
blockIdx
.x != (
numBlocksForTransfer
- 1) ?
126
n_per_block
:
max
(n -
block_offset
,
n_per_block
);
127
128
// Notify ready to sending rank, and wait until recieving rank is ready
129
if
(
threadIdx
.x == 0) {
130
nvshmemx_signal_op
(
notifyReady
, counter,
NVSHMEM_SIGNAL_SET
,
srcRank
);
131
nvshmem_signal_wait_until
(
notifyReady
,
NVSHMEM_CMP_EQ
, counter);
132
}
133
__syncthreads
();
134
135
// Push data
136
nvshmemx_double_put_signal_nbi_block
(
dest
+
block_offset
,
src
+
137
block_offset
,
dataSize
,
138
notifyDone
, counter,
139
NVSHMEM_SIGNAL_SET
,
destRank
);
140
}
141
}
99
__global__
void
pack_pushShmemKernel
(
const
double
*
__restrict__
u
, {
…
}
142
143
__global__
void
pushShmemKernelWait
(
uint64_t
counter,
144
uint64_t
*
notifyDone
)
145
{
146
// Notify done to receiving rank, and wait for data from sending rank
147
if
(
blockIdx
.x==0 &&
threadIdx
.x == 0) {
148
nvshmem_signal_wait_until
(
notifyDone
,
NVSHMEM_CMP_EQ
, counter);
149
}
150
}
143
__global__
void
pushShmemKernelWait
(
uint64_t
counter, {
…
}
151
152
153
154
155
#endif
u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
Definition
ax_helm_full_kernel.h:46
j
const int j
Definition
ax_helm_full_kernel.h:94
__syncthreads
__syncthreads()
dirichlet_apply_scalar_kernel
__global__ void dirichlet_apply_scalar_kernel(const int *__restrict__ msk, T *__restrict__ x, const T g, const int m)
Definition
dirichlet_kernel.h:42
pack_pushShmemKernel
__global__ void pack_pushShmemKernel(const T *__restrict__ u, T *dest, T *__restrict__ src, const int *__restrict__ dof, const int destRank, const int srcRank, const int n, uint64_t counter, uint64_t *notifyDone, uint64_t *notifyReady)
pushShmemKernelWait
__global__ void pushShmemKernelWait(uint64_t counter, uint64_t *notifyDone)
Definition
gs_nvshmem_kernels.h:143
max
#define max(a, b)
Definition
tensor.cu:40
src
gs
bcknd
device
cuda
gs_nvshmem_kernels.h
Generated on Fri Mar 14 2025 03:42:46 for Neko by
1.9.8