#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// Prevent the vanilla sm_30 intrinsics header from being included.
#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
                        __Type) \
  inline __device__ int __FnName(int __val, __Type __offset, \
                                 int __width = warpSize) { \
    return __IntIntrinsic(__val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(float __val, __Type __offset, \
                                   int __width = warpSize) { \
    return __FloatIntrinsic(__val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(long long __val, __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ long __FnName(long __val, __Type __offset, \
                                  int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned long long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(double __val, __Type __offset, \
                                    int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f,
                int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
#pragma pop_macro("__MAKE_SHUFFLES")

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
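
// Usage sketch (illustrative only, not part of this header): a warp-wide
// tree reduction built on the __shfl_down wrapper defined above. The helper
// name __warp_sum is hypothetical.
//
//   inline __device__ int __warp_sum(int __val) {
//     for (int __delta = warpSize / 2; __delta > 0; __delta /= 2)
//       __val += __shfl_down(__val, __delta);
//     return __val; // lane 0 now holds the sum across the warp
//   }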

#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)

#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
                             __Mask, __Type) \
  inline __device__ int __FnName(unsigned int __mask, int __val, \
                                 __Type __offset, int __width = warpSize) { \
    return __IntIntrinsic(__mask, __val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(unsigned int __mask, float __val, \
                                   __Type __offset, int __width = warpSize) { \
    return __FloatIntrinsic(__mask, __val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __mask, \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(unsigned int __mask, long long __val, \
                                       __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned int __mask, unsigned long long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ long __FnName(unsigned int __mask, long __val, \
                                  __Type __offset, int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>(::__FnName( \
          __mask, static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned int __mask, unsigned long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(unsigned int __mask, double __val, \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }

__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
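
// Usage sketch (illustrative only, not part of this header): a butterfly
// all-reduce built on __shfl_xor_sync; every lane of a fully active warp ends
// up with the same sum. The helper name __warp_allreduce_sum is hypothetical.
//
//   inline __device__ int __warp_allreduce_sum(int __val) {
//     for (int __lane_mask = warpSize / 2; __lane_mask > 0; __lane_mask /= 2)
//       __val += __shfl_xor_sync(0xffffffff, __val, __lane_mask);
//     return __val;
//   }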

inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
  return __nvvm_bar_warp_sync(mask);
}

inline __device__ void __barrier_sync(unsigned int id) {
  __nvvm_barrier_sync(id);
}

inline __device__ void __barrier_sync_count(unsigned int id,
                                            unsigned int count) {
  __nvvm_barrier_sync_cnt(id, count);
}
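
// Usage sketch (illustrative only, not part of this header): __syncwarp
// reconverges the lanes named in the mask and orders their memory accesses,
// so it is the appropriate barrier when lanes of a single warp exchange data
// through shared memory. The variable names below are hypothetical.
//
//   __smem[__lane] = __partial;        // each lane publishes its value
//   __syncwarp();                      // publish before any lane reads
//   int __neighbor = __smem[__lane ^ 1];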

inline __device__ int __all_sync(unsigned int mask, int pred) {
  return __nvvm_vote_all_sync(mask, pred);
}

inline __device__ int __any_sync(unsigned int mask, int pred) {
  return __nvvm_vote_any_sync(mask, pred);
}

inline __device__ int __uni_sync(unsigned int mask, int pred) {
  return __nvvm_vote_uni_sync(mask, pred);
}

inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
  return __nvvm_vote_ballot_sync(mask, pred);
}

inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }

inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
  return __nvvm_fns(mask, base, offset);
}
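
// Usage sketch (illustrative only, not part of this header): __ballot_sync
// returns one bit per lane for which the predicate was non-zero, so counting
// those bits with __popc (provided elsewhere in the CUDA headers) gives the
// number of lanes that voted yes. __val and __threshold are hypothetical.
//
//   unsigned int __votes = __ballot_sync(0xffffffff, __val > __threshold);
//   int __num_yes = __popc(__votes);
//   bool __unanimous = __all_sync(0xffffffff, __val > __threshold);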

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
                                                  unsigned int value) {
  return __nvvm_match_any_sync_i32(mask, value);
}

inline __device__ unsigned long long
__match64_any_sync(unsigned int mask, unsigned long long value) {
  return __nvvm_match_any_sync_i64(mask, value);
}

inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
  return __nvvm_match_all_sync_i32p(mask, value, pred);
}

inline __device__ unsigned long long
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
  return __nvvm_match_all_sync_i64p(mask, value, pred);
}
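
// Usage sketch (illustrative only, not part of this header): for each calling
// lane, __match32_any_sync returns the set of lanes in the mask whose value
// equals this lane's value, which is handy for grouping lanes by key. __ffs is
// provided elsewhere in the CUDA headers; __key is hypothetical.
//
//   unsigned int __peers = __match32_any_sync(0xffffffff, __key);
//   int __leader = __ffs(__peers) - 1; // lowest-numbered lane with the same key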

#include "crt/sm_70_rt.hpp"

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // CUDA_VERSION >= 9000

// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ signed char __ldg(const signed char *ptr) {
  return __nvvm_ldg_uc((const unsigned char *)ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
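
// Usage sketch (illustrative only, not part of this header): __ldg loads
// through the read-only data cache, which is safe when no thread in the grid
// writes the data during the kernel. The kernel and parameter names are
// hypothetical.
//
//   __global__ void __scale(float *__out, const float *__in, int __n) {
//     int __i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (__i < __n)
//       __out[__i] = 2.0f * __ldg(&__in[__i]);
//   }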

inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
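
// Usage sketch (illustrative only, not part of this header): the vector
// overloads above let one thread issue a single wide read-only load instead
// of several scalar ones; the pointer must satisfy the vector type's
// alignment (16 bytes for float4). Names are hypothetical.
//
//   const float4 *__vec = reinterpret_cast<const float4 *>(__in);
//   float4 __v = __ldg(&__vec[__i]);
//   float __sum = __v.x + __v.y + __v.z + __v.w;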

inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#endif // defined(__CLANG_CUDA_INTRINSICS_H__)