diff options
Diffstat (limited to 'contrib/llvm/tools/clang/lib/Headers/smmintrin.h')
-rw-r--r-- | contrib/llvm/tools/clang/lib/Headers/smmintrin.h | 181 |
1 files changed, 101 insertions, 80 deletions
diff --git a/contrib/llvm/tools/clang/lib/Headers/smmintrin.h b/contrib/llvm/tools/clang/lib/Headers/smmintrin.h index 04bd0722b11f..69ad07f42ad6 100644 --- a/contrib/llvm/tools/clang/lib/Headers/smmintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/smmintrin.h @@ -24,14 +24,10 @@ #ifndef _SMMINTRIN_H #define _SMMINTRIN_H -#ifndef __SSE4_1__ -#error "SSE4.1 instruction set not enabled" -#else - #include <tmmintrin.h> /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"))) /* SSE4 Rounding macros. */ #define _MM_FROUND_TO_NEAREST_INT 0x00 @@ -61,35 +57,28 @@ #define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR) #define _mm_round_ps(X, M) __extension__ ({ \ - __m128 __X = (X); \ - (__m128) __builtin_ia32_roundps((__v4sf)__X, (M)); }) + (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); }) #define _mm_round_ss(X, Y, M) __extension__ ({ \ - __m128 __X = (X); \ - __m128 __Y = (Y); \ - (__m128) __builtin_ia32_roundss((__v4sf)__X, (__v4sf)__Y, (M)); }) + (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (M)); }) #define _mm_round_pd(X, M) __extension__ ({ \ - __m128d __X = (X); \ - (__m128d) __builtin_ia32_roundpd((__v2df)__X, (M)); }) + (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); }) #define _mm_round_sd(X, Y, M) __extension__ ({ \ - __m128d __X = (X); \ - __m128d __Y = (Y); \ - (__m128d) __builtin_ia32_roundsd((__v2df)__X, (__v2df)__Y, (M)); }) + (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (M)); }) /* SSE4 Packed Blending Intrinsics. */ #define _mm_blend_pd(V1, V2, M) __extension__ ({ \ - __m128d __V1 = (V1); \ - __m128d __V2 = (V2); \ - (__m128d)__builtin_shufflevector((__v2df)__V1, (__v2df)__V2, \ + (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \ + (__v2df)(__m128d)(V2), \ (((M) & 0x01) ? 2 : 0), \ (((M) & 0x02) ? 3 : 1)); }) #define _mm_blend_ps(V1, V2, M) __extension__ ({ \ - __m128 __V1 = (V1); \ - __m128 __V2 = (V2); \ - (__m128)__builtin_shufflevector((__v4sf)__V1, (__v4sf)__V2, \ + (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \ (((M) & 0x01) ? 4 : 0), \ (((M) & 0x02) ? 5 : 1), \ (((M) & 0x04) ? 6 : 2), \ @@ -117,9 +106,8 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) } #define _mm_blend_epi16(V1, V2, M) __extension__ ({ \ - __m128i __V1 = (V1); \ - __m128i __V2 = (V2); \ - (__m128i)__builtin_shufflevector((__v8hi)__V1, (__v8hi)__V2, \ + (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \ + (__v8hi)(__m128i)(V2), \ (((M) & 0x01) ? 8 : 0), \ (((M) & 0x02) ? 9 : 1), \ (((M) & 0x04) ? 10 : 2), \ @@ -144,20 +132,18 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2) /* SSE4 Floating Point Dot Product Instructions. */ #define _mm_dp_ps(X, Y, M) __extension__ ({ \ - __m128 __X = (X); \ - __m128 __Y = (Y); \ - (__m128) __builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, (M)); }) + (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (M)); }) #define _mm_dp_pd(X, Y, M) __extension__ ({\ - __m128d __X = (X); \ - __m128d __Y = (Y); \ - (__m128d) __builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, (M)); }) + (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (M)); }) /* SSE4 Streaming Load Hint Instruction. */ static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_stream_load_si128 (__m128i *__V) +_mm_stream_load_si128 (__m128i const *__V) { - return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V); + return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V); } /* SSE4 Packed Integer Min/Max Instructions. */ @@ -213,7 +199,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) #define _mm_extract_ps(X, N) (__extension__ \ ({ union { int __i; float __f; } __t; \ - __v4sf __a = (__v4sf)(X); \ + __v4sf __a = (__v4sf)(__m128)(X); \ __t.__f = __a[(N) & 3]; \ __t.__i;})) @@ -221,39 +207,44 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /* Extract a single-precision float from X at index N into D. */ #define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \ (D) = __a[N]; })) - + /* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create an index suitable for _mm_insert_ps. */ #define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z)) - + /* Extract a float from X at index N into the first index of the return. */ #define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \ _MM_MK_INSERTPS_NDX((N), 0, 0x0e)) - + /* Insert int into packed integer array at index. */ -#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \ - __a[(N) & 15] = (I); \ - __a;})) -#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \ - __a[(N) & 3] = (I); \ - __a;})) +#define _mm_insert_epi8(X, I, N) (__extension__ \ + ({ __v16qi __a = (__v16qi)(__m128i)(X); \ + __a[(N) & 15] = (I); \ + __a;})) +#define _mm_insert_epi32(X, I, N) (__extension__ \ + ({ __v4si __a = (__v4si)(__m128i)(X); \ + __a[(N) & 3] = (I); \ + __a;})) #ifdef __x86_64__ -#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \ - __a[(N) & 1] = (I); \ - __a;})) +#define _mm_insert_epi64(X, I, N) (__extension__ \ + ({ __v2di __a = (__v2di)(__m128i)(X); \ + __a[(N) & 1] = (I); \ + __a;})) #endif /* __x86_64__ */ /* Extract int from packed integer array at index. This returns the element * as a zero extended value, so it is unsigned. */ -#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \ - (int)(unsigned char) \ - __a[(N) & 15];})) -#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \ - __a[(N) & 3];})) +#define _mm_extract_epi8(X, N) (__extension__ \ + ({ __v16qi __a = (__v16qi)(__m128i)(X); \ + (int)(unsigned char) __a[(N) & 15];})) +#define _mm_extract_epi32(X, N) (__extension__ \ + ({ __v4si __a = (__v4si)(__m128i)(X); \ + (int)__a[(N) & 3];})) #ifdef __x86_64__ -#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \ - __a[(N) & 1];})) +#define _mm_extract_epi64(X, N) (__extension__ \ + ({ __v2di __a = (__v2di)(__m128i)(X); \ + (long long)__a[(N) & 1];})) #endif /* __x86_64 */ /* SSE4 128-bit Packed Integer Comparisons. */ @@ -290,37 +281,44 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) { - return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V); + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. */ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) { - return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V); + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. */ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) { - return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V); + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. */ + typedef signed char __v16qs __attribute__((__vector_size__(16))); + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) { - return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V); + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) { - return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V); + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) { - return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V); + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); } /* SSE4 Packed Integer Zero-Extension. */ @@ -369,9 +367,8 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2) /* SSE4 Multiple Packed Sums of Absolute Difference. */ #define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \ - __m128i __X = (X); \ - __m128i __Y = (Y); \ - (__m128i) __builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, (M)); }) + (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (M)); }) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) @@ -379,9 +376,13 @@ _mm_minpos_epu16(__m128i __V) return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V); } +/* Handle the sse4.2 definitions here. */ + /* These definitions are normally in nmmintrin.h, but gcc puts them in here so we'll do the same. */ -#ifdef __SSE4_2__ + +#undef __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) /* These specify the type of data that we're comparing. */ #define _SIDD_UBYTE_OPS 0x00 @@ -410,36 +411,59 @@ _mm_minpos_epu16(__m128i __V) #define _SIDD_UNIT_MASK 0x40 /* SSE4.2 Packed Comparison Intrinsics. */ -#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M)) -#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M)) +#define _mm_cmpistrm(A, B, M) \ + (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M)) +#define _mm_cmpistri(A, B, M) \ + (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M)) #define _mm_cmpestrm(A, LA, B, LB, M) \ - __builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M)) + (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M)) #define _mm_cmpestri(A, LA, B, LB, M) \ - __builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M)) - + (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M)) + /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */ #define _mm_cmpistra(A, B, M) \ - __builtin_ia32_pcmpistria128((A), (B), (M)) + (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M)) #define _mm_cmpistrc(A, B, M) \ - __builtin_ia32_pcmpistric128((A), (B), (M)) + (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M)) #define _mm_cmpistro(A, B, M) \ - __builtin_ia32_pcmpistrio128((A), (B), (M)) + (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M)) #define _mm_cmpistrs(A, B, M) \ - __builtin_ia32_pcmpistris128((A), (B), (M)) + (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M)) #define _mm_cmpistrz(A, B, M) \ - __builtin_ia32_pcmpistriz128((A), (B), (M)) + (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M)) #define _mm_cmpestra(A, LA, B, LB, M) \ - __builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M)) + (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M)) #define _mm_cmpestrc(A, LA, B, LB, M) \ - __builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M)) + (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M)) #define _mm_cmpestro(A, LA, B, LB, M) \ - __builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M)) + (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M)) #define _mm_cmpestrs(A, LA, B, LB, M) \ - __builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M)) + (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M)) #define _mm_cmpestrz(A, LA, B, LB, M) \ - __builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M)) + (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M)) /* SSE4.2 Compare Packed Data -- Greater Than. */ static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -481,7 +505,4 @@ _mm_crc32_u64(unsigned long long __C, unsigned long long __D) #include <popcntintrin.h> #endif -#endif /* __SSE4_2__ */ -#endif /* __SSE4_1__ */ - #endif /* _SMMINTRIN_H */ |