path: root/lib/Headers/__clang_cuda_intrinsics.h
Diffstat (limited to 'lib/Headers/__clang_cuda_intrinsics.h')
-rw-r--r--  lib/Headers/__clang_cuda_intrinsics.h  146
1 file changed, 146 insertions, 0 deletions
diff --git a/lib/Headers/__clang_cuda_intrinsics.h b/lib/Headers/__clang_cuda_intrinsics.h
index b43ce21d0bb3..02d68a2e618e 100644
--- a/lib/Headers/__clang_cuda_intrinsics.h
+++ b/lib/Headers/__clang_cuda_intrinsics.h
@@ -92,6 +92,152 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+#if CUDA_VERSION >= 9000
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
+// __shfl_sync and the other __shfl_*_sync variants available in CUDA-9.
+#pragma push_macro("__MAKE_SYNC_SHUFFLES")
+#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
+ __Mask) \
+ inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \
+ int __width = warpSize) { \
+ return __IntIntrinsic(__mask, __val, __offset, \
+ ((warpSize - __width) << 8) | (__Mask)); \
+ } \
+ inline __device__ float __FnName(unsigned int __mask, float __val, \
+ int __offset, int __width = warpSize) { \
+ return __FloatIntrinsic(__mask, __val, __offset, \
+ ((warpSize - __width) << 8) | (__Mask)); \
+ } \
+ inline __device__ unsigned int __FnName(unsigned int __mask, \
+ unsigned int __val, int __offset, \
+ int __width = warpSize) { \
+ return static_cast<unsigned int>( \
+ ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
+ } \
+ inline __device__ long long __FnName(unsigned int __mask, long long __val, \
+ int __offset, int __width = warpSize) { \
+ struct __Bits { \
+ int __a, __b; \
+ }; \
+ _Static_assert(sizeof(__val) == sizeof(__Bits)); \
+ _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
+ __Bits __tmp; \
+ memcpy(&__tmp, &__val, sizeof(__val)); \
+ __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
+ __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
+ long long __ret; \
+ memcpy(&__ret, &__tmp, sizeof(__tmp)); \
+ return __ret; \
+ } \
+ inline __device__ unsigned long long __FnName( \
+ unsigned int __mask, unsigned long long __val, int __offset, \
+ int __width = warpSize) { \
+ return static_cast<unsigned long long>(::__FnName( \
+ __mask, static_cast<long long>(__val), __offset, __width)); \
+ } \
+ inline __device__ long __FnName(unsigned int __mask, long __val, \
+ int __offset, int __width = warpSize) { \
+ _Static_assert(sizeof(long) == sizeof(long long) || \
+ sizeof(long) == sizeof(int)); \
+ if (sizeof(long) == sizeof(long long)) { \
+ return static_cast<long>(::__FnName( \
+ __mask, static_cast<long long>(__val), __offset, __width)); \
+ } else if (sizeof(long) == sizeof(int)) { \
+ return static_cast<long>( \
+ ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
+ } \
+ } \
+ inline __device__ unsigned long __FnName(unsigned int __mask, \
+ unsigned long __val, int __offset, \
+ int __width = warpSize) { \
+ return static_cast<unsigned long>( \
+ ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
+ } \
+ inline __device__ double __FnName(unsigned int __mask, double __val, \
+ int __offset, int __width = warpSize) { \
+ long long __tmp; \
+ _Static_assert(sizeof(__tmp) == sizeof(__val)); \
+ memcpy(&__tmp, &__val, sizeof(__val)); \
+ __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
+ double __ret; \
+ memcpy(&__ret, &__tmp, sizeof(__ret)); \
+ return __ret; \
+ }
+__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
+ __nvvm_shfl_sync_idx_f32, 0x1f);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
+ __nvvm_shfl_sync_up_f32, 0);
+__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
+ __nvvm_shfl_sync_down_f32, 0x1f);
+__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
+ __nvvm_shfl_sync_bfly_f32, 0x1f);
+#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
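(Usage sketch, illustrative only and not part of this patch: a full-warp sum
reduction built on the __shfl_down_sync wrapper generated above. The helper
name warp_reduce_sum is hypothetical, and the example assumes a fully
converged 32-lane warp, hence the 0xffffffff mask.)

__device__ int warp_reduce_sum(int val) {
  // Halve the shuffle distance each step; every lane adds its partner's value.
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffff, val, offset);
  return val; // lane 0 now holds the sum over all 32 lanes
}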
+
+inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
+ return __nvvm_bar_warp_sync(mask);
+}
+
+inline __device__ void __barrier_sync(unsigned int id) {
+ __nvvm_barrier_sync(id);
+}
+
+inline __device__ void __barrier_sync_count(unsigned int id,
+ unsigned int count) {
+ __nvvm_barrier_sync_cnt(id, count);
+}
+
+inline __device__ int __all_sync(unsigned int mask, int pred) {
+ return __nvvm_vote_all_sync(mask, pred);
+}
+
+inline __device__ int __any_sync(unsigned int mask, int pred) {
+ return __nvvm_vote_any_sync(mask, pred);
+}
+
+inline __device__ int __uni_sync(unsigned int mask, int pred) {
+ return __nvvm_vote_uni_sync(mask, pred);
+}
+
+inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
+ return __nvvm_vote_ballot_sync(mask, pred);
+}
+
+inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
+
+inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
+ return __nvvm_fns(mask, base, offset);
+}
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
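(Usage sketch, illustrative only and not part of this patch: counting how many
lanes of a warp hold a non-zero predicate with the __ballot_sync wrapper
defined above. The helper name warp_count_pred is hypothetical; the example
assumes all 32 lanes are converged. __popc is CUDA's population-count
intrinsic.)

__device__ int warp_count_pred(int pred) {
  // Bit i of the ballot is set iff lane i passed a non-zero predicate.
  unsigned int votes = __ballot_sync(0xffffffff, pred);
  return __popc(votes);
}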
+
+// Define __match* builtins CUDA-9 headers expect to see.
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+inline __device__ unsigned int __match32_any_sync(unsigned int mask,
+ unsigned int value) {
+ return __nvvm_match_any_sync_i32(mask, value);
+}
+
+inline __device__ unsigned long long
+__match64_any_sync(unsigned int mask, unsigned long long value) {
+ return __nvvm_match_any_sync_i64(mask, value);
+}
+
+inline __device__ unsigned int
+__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
+ return __nvvm_match_all_sync_i32p(mask, value, pred);
+}
+
+inline __device__ unsigned long long
+__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
+ return __nvvm_match_all_sync_i64p(mask, value, pred);
+}
+#include "crt/sm_70_rt.hpp"
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
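(Usage sketch, illustrative only and not part of this patch: grouping the
lanes of a warp by equal 32-bit key with the __match32_any_sync wrapper
defined above; requires sm_70 or newer. The helper name lanes_with_same_key
is hypothetical, and a fully converged warp is assumed.)

__device__ unsigned int lanes_with_same_key(unsigned int key) {
  // Returns a bitmask of the lanes whose key equals this lane's key.
  return __match32_any_sync(0xffffffffu, key);
}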
+#endif // CUDA_VERSION >= 9000
+
// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
// Prevent the vanilla sm_32 intrinsics header from being included.