Diffstat (limited to 'lib/Headers/__clang_cuda_intrinsics.h')
-rw-r--r-- | lib/Headers/__clang_cuda_intrinsics.h | 146 |
1 files changed, 146 insertions, 0 deletions
diff --git a/lib/Headers/__clang_cuda_intrinsics.h b/lib/Headers/__clang_cuda_intrinsics.h
index b43ce21d0bb3..02d68a2e618e 100644
--- a/lib/Headers/__clang_cuda_intrinsics.h
+++ b/lib/Headers/__clang_cuda_intrinsics.h
@@ -92,6 +92,152 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
 
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
 
+#if CUDA_VERSION >= 9000
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
+// __shfl_sync_* variants available in CUDA-9
+#pragma push_macro("__MAKE_SYNC_SHUFFLES")
+#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
+                             __Mask) \
+  inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \
+                                 int __width = warpSize) { \
+    return __IntIntrinsic(__mask, __val, __offset, \
+                          ((warpSize - __width) << 8) | (__Mask)); \
+  } \
+  inline __device__ float __FnName(unsigned int __mask, float __val, \
+                                   int __offset, int __width = warpSize) { \
+    return __FloatIntrinsic(__mask, __val, __offset, \
+                            ((warpSize - __width) << 8) | (__Mask)); \
+  } \
+  inline __device__ unsigned int __FnName(unsigned int __mask, \
+                                          unsigned int __val, int __offset, \
+                                          int __width = warpSize) { \
+    return static_cast<unsigned int>( \
+        ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
+  } \
+  inline __device__ long long __FnName(unsigned int __mask, long long __val, \
+                                       int __offset, int __width = warpSize) { \
+    struct __Bits { \
+      int __a, __b; \
+    }; \
+    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
+    __Bits __tmp; \
+    memcpy(&__tmp, &__val, sizeof(__val)); \
+    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
+    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
+    long long __ret; \
+    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
+    return __ret; \
+  } \
+  inline __device__ unsigned long long __FnName( \
+      unsigned int __mask, unsigned long long __val, int __offset, \
+      int __width = warpSize) { \
+    return static_cast<unsigned long long>(::__FnName( \
+        __mask, static_cast<long long>(__val), __offset, __width)); \
+  } \
+  inline __device__ long __FnName(unsigned int __mask, long __val, \
+                                  int __offset, int __width = warpSize) { \
+    _Static_assert(sizeof(long) == sizeof(long long) || \
+                   sizeof(long) == sizeof(int)); \
+    if (sizeof(long) == sizeof(long long)) { \
+      return static_cast<long>(::__FnName( \
+          __mask, static_cast<long long>(__val), __offset, __width)); \
+    } else if (sizeof(long) == sizeof(int)) { \
+      return static_cast<long>( \
+          ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
+    } \
+  } \
+  inline __device__ unsigned long __FnName(unsigned int __mask, \
+                                           unsigned long __val, int __offset, \
+                                           int __width = warpSize) { \
+    return static_cast<unsigned long>( \
+        ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
+  } \
+  inline __device__ double __FnName(unsigned int __mask, double __val, \
+                                    int __offset, int __width = warpSize) { \
+    long long __tmp; \
+    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
+    memcpy(&__tmp, &__val, sizeof(__val)); \
+    __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
+    double __ret; \
+    memcpy(&__ret, &__tmp, sizeof(__ret)); \
+    return __ret; \
+  }
+__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
+                     __nvvm_shfl_sync_idx_f32, 0x1f);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
+                     __nvvm_shfl_sync_up_f32, 0);
+__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
+                     __nvvm_shfl_sync_down_f32, 0x1f);
+__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
+                     __nvvm_shfl_sync_bfly_f32, 0x1f);
+#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
+
+inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
+  return __nvvm_bar_warp_sync(mask);
+}
+
+inline __device__ void __barrier_sync(unsigned int id) {
+  __nvvm_barrier_sync(id);
+}
+
+inline __device__ void __barrier_sync_count(unsigned int id,
+                                            unsigned int count) {
+  __nvvm_barrier_sync_cnt(id, count);
+}
+
+inline __device__ int __all_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_all_sync(mask, pred);
+}
+
+inline __device__ int __any_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_any_sync(mask, pred);
+}
+
+inline __device__ int __uni_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_uni_sync(mask, pred);
+}
+
+inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_ballot_sync(mask, pred);
+}
+
+inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
+
+inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
+  return __nvvm_fns(mask, base, offset);
+}
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+// Define __match* builtins CUDA-9 headers expect to see.
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+inline __device__ unsigned int __match32_any_sync(unsigned int mask,
+                                                  unsigned int value) {
+  return __nvvm_match_any_sync_i32(mask, value);
+}
+
+inline __device__ unsigned long long
+__match64_any_sync(unsigned int mask, unsigned long long value) {
+  return __nvvm_match_any_sync_i64(mask, value);
+}
+
+inline __device__ unsigned int
+__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
+  return __nvvm_match_all_sync_i32p(mask, value, pred);
+}
+
+inline __device__ unsigned long long
+__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
+  return __nvvm_match_all_sync_i64p(mask, value, pred);
+}
+#include "crt/sm_70_rt.hpp"
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+#endif // CUDA_VERSION >= 9000
+
 // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
 
 // Prevent the vanilla sm_32 intrinsics header from being included.
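
For context on how the new `_sync` shuffles are meant to be called: unlike the pre-Volta `__shfl_*` builtins, each call takes an explicit membership mask naming the lanes that participate. Below is a minimal usage sketch, not part of this commit; the kernel and helper names (`warp_reduce_sum`, `block_sum`) are hypothetical.

// Hypothetical sketch: warp-wide sum reduction built on __shfl_down_sync.
// The 0xffffffff mask asserts that all 32 lanes of the warp participate,
// which CUDA-9 requires callers to state explicitly.
__device__ int warp_reduce_sum(int val) {
  // Each step folds the upper half of the remaining lanes into the lower
  // half; after five steps lane 0 holds the sum of all 32 lanes.
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffff, val, offset);
  return val;
}

__global__ void block_sum(const int *in, int *out) {
  int v = warp_reduce_sum(in[blockIdx.x * blockDim.x + threadIdx.x]);
  if ((threadIdx.x & 31) == 0) // lane 0 of each warp publishes its partial sum
    atomicAdd(out, v);
}

The default `__width = warpSize` path in the macro above yields a zero `(warpSize - __width) << 8` term, so the packed control operand reduces to the plain clamp mask (`0x1f`, or `0` for `shfl.up`) in the common case.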
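Likewise, a brief sketch (again hypothetical, not from this commit) of the sm_70 match wrappers: `__match32_any_sync` returns the bitmask of lanes whose 32-bit value equals the caller's, which reduces leader election among equal-keyed lanes to a couple of instructions.

// Hypothetical sketch: elect one leader lane among all currently active
// lanes that hold the same 32-bit key. Requires __CUDA_ARCH__ >= 700.
__device__ bool is_group_leader(unsigned int key) {
  unsigned int active = __activemask();
  // Bitmask of active lanes whose `key` equals this lane's `key`.
  unsigned int peers = __match32_any_sync(active, key);
  // __ffs is 1-based, so the lowest set bit names the leader lane.
  return (threadIdx.x & 31) == __ffs(peers) - 1;
}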