1 #ifndef __MATH_MATHOPS_KERNEL_H__
2 #define __MATH_MATHOPS_KERNEL_H__
// Fragment of a kernel body: the enclosing signature and both loop bodies are
// not visible in this listing.
// NOTE(review): by elimination against the signatures listed at the end of this
// listing, this is presumably the body of opchsign_kernel - confirm.
// idx is this thread's global index along x; str is the total number of
// threads launched along x (threads per block times blocks in the grid).
47 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
48 const int str = blockDim.x * gridDim.x;
// First grid-stride loop: this thread visits i = idx, idx + str, ... while i < n.
51 for (
int i = idx;
i < n;
i += str) {
// Second grid-stride loop over the same index range.
// NOTE(review): presumably the alternative branch of a conditional whose test
// is not visible here - confirm against the full file.
58 for (
int i = idx;
i < n;
i += str) {
// Fragment of a kernel: only the last pointer parameter, the index setup and
// two loop headers are visible; the loop bodies are not shown.
// NOTE(review): a lone read-only `c` array with no b1..b3 inputs matches the
// opcolv_kernel signature listed at the end of this listing - confirm.
73 const T * __restrict__ c,
// idx is this thread's global index along x; str is the total number of
// threads launched along x.
77 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
78 const int str = blockDim.x * gridDim.x;
// First grid-stride loop: this thread visits i = idx, idx + str, ... while i < n.
81 for (
int i = idx;
i < n;
i += str) {
// Second grid-stride loop over the same index range.
// NOTE(review): presumably the alternative branch of a conditional whose test
// is not visible here - confirm against the full file.
88 for (
int i = idx;
i < n;
i += str) {
// Fragment of a kernel that scales three input arrays element-wise by c[i] and
// by a scalar d. The head of the signature, the declaration of d, and the
// branch/closing lines are not visible in this listing.
// NOTE(review): the b1..b3, c, d parameter shape matches the opcolv3c_kernel
// signature listed at the end of this listing - confirm.
// Read-only inputs (pointers to const T).
103 const T * __restrict__ b1,
104 const T * __restrict__ b2,
105 const T * __restrict__ b3,
106 const T * __restrict__ c,
// idx is this thread's global index along x; str is the total number of
// threads launched along x.
111 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
112 const int str = blockDim.x * gridDim.x;
// Grid-stride loop writing all three outputs: a_k[i] = b_k[i] * c[i] * d.
// NOTE(review): i and str are int, so i += str could overflow if n is close to
// INT_MAX - confirm the callers' upper bound on n.
115 for (
int i = idx;
i < n;
i += str) {
116 a1[
i] = b1[
i] * c[
i] * d;
117 a2[
i] = b2[
i] * c[
i] * d;
118 a3[
i] = b3[
i] * c[
i] * d;
// Grid-stride loop writing only a1 and a2; a3 and b3 are not touched here.
// NOTE(review): presumably the non-3D branch selected by the gdim parameter;
// the conditional itself is not visible - confirm.
122 for (
int i = idx;
i < n;
i += str) {
123 a1[
i] = b1[
i] * c[
i] * d;
124 a2[
i] = b2[
i] * c[
i] * d;
// Fragment of a templated kernel that accumulates a scaled copy of b_k into
// a_k, using a single multiplier c for every element (c is used without an
// index below). The kernel name, the a1..a3 parameters, the declaration of c
// and the branch/closing lines are not visible in this listing.
// NOTE(review): the parameter shape matches the opadd2cm_kernel signature
// listed at the end of this listing - confirm.
133 template<
typename T>
// Read-only inputs (pointers to const T).
137 const T * __restrict__ b1,
138 const T * __restrict__ b2,
139 const T * __restrict__ b3,
// idx is this thread's global index along x; str is the total number of
// threads launched along x.
144 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
145 const int str = blockDim.x * gridDim.x;
// Grid-stride loop updating all three outputs in place:
// a_k[i] = a_k[i] + b_k[i] * c.
// NOTE(review): i and str are int, so i += str could overflow if n is close to
// INT_MAX - confirm the callers' upper bound on n.
148 for (
int i = idx;
i < n;
i += str) {
149 a1[
i] = a1[
i] + b1[
i] * c;
150 a2[
i] = a2[
i] + b2[
i] * c;
151 a3[
i] = a3[
i] + b3[
i] * c;
// Grid-stride loop updating only a1 and a2; a3 and b3 are not touched here.
// NOTE(review): presumably the non-3D branch selected by the gdim parameter;
// the conditional itself is not visible - confirm.
155 for (
int i = idx;
i < n;
i += str) {
156 a1[
i] = a1[
i] + b1[
i] * c;
157 a2[
i] = a2[
i] + b2[
i] * c;
// Fragment of a templated kernel that accumulates the element-wise product
// b_k[i] * c[i] into a_k[i]. The kernel name, the a1..a3 parameters and the
// branch/closing lines are not visible in this listing.
// NOTE(review): the parameter shape matches the opadd2col_kernel signature
// listed at the end of this listing - confirm.
167 template<
typename T>
// Read-only inputs (pointers to const T); c is indexed per element below.
171 const T * __restrict__ b1,
172 const T * __restrict__ b2,
173 const T * __restrict__ b3,
174 const T * __restrict__ c,
// idx is this thread's global index along x; str is the total number of
// threads launched along x.
178 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
179 const int str = blockDim.x * gridDim.x;
// Grid-stride loop updating all three outputs in place:
// a_k[i] = a_k[i] + b_k[i] * c[i].
// NOTE(review): i and str are int, so i += str could overflow if n is close to
// INT_MAX - confirm the callers' upper bound on n.
182 for (
int i = idx;
i < n;
i += str) {
183 a1[
i] = a1[
i] + b1[
i] * c[
i];
184 a2[
i] = a2[
i] + b2[
i] * c[
i];
185 a3[
i] = a3[
i] + b3[
i] * c[
i];
// Grid-stride loop updating only a1 and a2; a3 and b3 are not touched here.
// NOTE(review): presumably the non-3D branch selected by the gdim parameter;
// the conditional itself is not visible - confirm.
189 for (
int i = idx;
i < n;
i += str) {
190 a1[
i] = a1[
i] + b1[
i] * c[
i];
191 a2[
i] = a2[
i] + b2[
i] * c[
i];
// opcolv_kernel (signature only; no template header or body on this line).
// Parameters: a1, a2, a3 - non-const T arrays; c - read-only T array;
// gdim, n - ints passed by value.
// NOTE(review): what the kernel computes is not visible on this line; n is
// presumably the element count and gdim the number of active components - confirm.
__global__ void opcolv_kernel(T *__restrict__ a1, T *__restrict__ a2, T *__restrict__ a3, const T *__restrict__ c, const int gdim, const int n)
// opchsign_kernel (signature only; no template header or body on this line).
// Parameters: a1, a2, a3 - non-const T arrays; gdim, n - ints passed by value.
// There are no read-only inputs, so any result can only depend on a1..a3
// themselves and the two ints.
// NOTE(review): the name suggests an in-place sign change, but the body is not
// visible on this line - confirm.
__global__ void opchsign_kernel(T *__restrict__ a1, T *__restrict__ a2, T *__restrict__ a3, const int gdim, const int n)
// opadd2col_kernel (signature only; no template header or body on this line).
// Parameters: a1, a2, a3 - non-const T arrays; b1, b2, b3 and c - read-only
// T arrays; gdim, n - ints passed by value.
// NOTE(review): presumably the kernel whose fragment earlier in this listing
// computes a_k[i] = a_k[i] + b_k[i] * c[i] - confirm.
__global__ void opadd2col_kernel(T *__restrict__ a1, T *__restrict__ a2, T *__restrict__ a3, const T *__restrict__ b1, const T *__restrict__ b2, const T *__restrict__ b3, const T *__restrict__ c, const int gdim, const int n)
// opcolv3c_kernel (signature only; no template header or body on this line).
// Parameters: a1, a2, a3 - non-const T arrays; b1, b2, b3 and c - read-only
// T arrays; d - scalar T passed by value; gdim, n - ints passed by value.
// NOTE(review): presumably the kernel whose fragment earlier in this listing
// computes a_k[i] = b_k[i] * c[i] * d - confirm.
__global__ void opcolv3c_kernel(T *__restrict__ a1, T *__restrict__ a2, T *__restrict__ a3, const T *__restrict__ b1, const T *__restrict__ b2, const T *__restrict__ b3, const T *__restrict__ c, const T d, const int gdim, const int n)
// opadd2cm_kernel (signature only; no template header or body on this line).
// Parameters: a1, a2, a3 - non-const T arrays; b1, b2, b3 - read-only T
// arrays; c - scalar T passed by value; gdim, n - ints passed by value.
// NOTE(review): presumably the kernel whose fragment earlier in this listing
// computes a_k[i] = a_k[i] + b_k[i] * c - confirm.
__global__ void opadd2cm_kernel(T *__restrict__ a1, T *__restrict__ a2, T *__restrict__ a3, const T *__restrict__ b1, const T *__restrict__ b2, const T *__restrict__ b3, const T c, const int gdim, const int n)