aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm/tools/clang/lib/Headers/smmintrin.h
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/tools/clang/lib/Headers/smmintrin.h')
-rw-r--r--contrib/llvm/tools/clang/lib/Headers/smmintrin.h181
1 files changed, 101 insertions, 80 deletions
diff --git a/contrib/llvm/tools/clang/lib/Headers/smmintrin.h b/contrib/llvm/tools/clang/lib/Headers/smmintrin.h
index 04bd0722b11f..69ad07f42ad6 100644
--- a/contrib/llvm/tools/clang/lib/Headers/smmintrin.h
+++ b/contrib/llvm/tools/clang/lib/Headers/smmintrin.h
@@ -24,14 +24,10 @@
#ifndef _SMMINTRIN_H
#define _SMMINTRIN_H
-#ifndef __SSE4_1__
-#error "SSE4.1 instruction set not enabled"
-#else
-
#include <tmmintrin.h>
/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
/* SSE4 Rounding macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
@@ -61,35 +57,28 @@
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
#define _mm_round_ps(X, M) __extension__ ({ \
- __m128 __X = (X); \
- (__m128) __builtin_ia32_roundps((__v4sf)__X, (M)); })
+ (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
#define _mm_round_ss(X, Y, M) __extension__ ({ \
- __m128 __X = (X); \
- __m128 __Y = (Y); \
- (__m128) __builtin_ia32_roundss((__v4sf)__X, (__v4sf)__Y, (M)); })
+ (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (M)); })
#define _mm_round_pd(X, M) __extension__ ({ \
- __m128d __X = (X); \
- (__m128d) __builtin_ia32_roundpd((__v2df)__X, (M)); })
+ (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
#define _mm_round_sd(X, Y, M) __extension__ ({ \
- __m128d __X = (X); \
- __m128d __Y = (Y); \
- (__m128d) __builtin_ia32_roundsd((__v2df)__X, (__v2df)__Y, (M)); })
+ (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (M)); })
/* SSE4 Packed Blending Intrinsics. */
#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
- __m128d __V1 = (V1); \
- __m128d __V2 = (V2); \
- (__m128d)__builtin_shufflevector((__v2df)__V1, (__v2df)__V2, \
+ (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
+ (__v2df)(__m128d)(V2), \
(((M) & 0x01) ? 2 : 0), \
(((M) & 0x02) ? 3 : 1)); })
#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
- __m128 __V1 = (V1); \
- __m128 __V2 = (V2); \
- (__m128)__builtin_shufflevector((__v4sf)__V1, (__v4sf)__V2, \
+ (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
(((M) & 0x01) ? 4 : 0), \
(((M) & 0x02) ? 5 : 1), \
(((M) & 0x04) ? 6 : 2), \
@@ -117,9 +106,8 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
}
#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
- __m128i __V1 = (V1); \
- __m128i __V2 = (V2); \
- (__m128i)__builtin_shufflevector((__v8hi)__V1, (__v8hi)__V2, \
+ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
+ (__v8hi)(__m128i)(V2), \
(((M) & 0x01) ? 8 : 0), \
(((M) & 0x02) ? 9 : 1), \
(((M) & 0x04) ? 10 : 2), \
@@ -144,20 +132,18 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/* SSE4 Floating Point Dot Product Instructions. */
#define _mm_dp_ps(X, Y, M) __extension__ ({ \
- __m128 __X = (X); \
- __m128 __Y = (Y); \
- (__m128) __builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, (M)); })
+ (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (M)); })
#define _mm_dp_pd(X, Y, M) __extension__ ({\
- __m128d __X = (X); \
- __m128d __Y = (Y); \
- (__m128d) __builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, (M)); })
+ (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (M)); })
/* SSE4 Streaming Load Hint Instruction. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_stream_load_si128 (__m128i *__V)
+_mm_stream_load_si128 (__m128i const *__V)
{
- return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
+ return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V);
}
/* SSE4 Packed Integer Min/Max Instructions. */
@@ -213,7 +199,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
#define _mm_extract_ps(X, N) (__extension__ \
({ union { int __i; float __f; } __t; \
- __v4sf __a = (__v4sf)(X); \
+ __v4sf __a = (__v4sf)(__m128)(X); \
__t.__f = __a[(N) & 3]; \
__t.__i;}))
@@ -221,39 +207,44 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/* Extract a single-precision float from X at index N into D. */
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
(D) = __a[N]; }))
-
+
/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
an index suitable for _mm_insert_ps. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
-
+
/* Extract a float from X at index N into the first index of the return. */
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \
_MM_MK_INSERTPS_NDX((N), 0, 0x0e))
-
+
/* Insert int into packed integer array at index. */
-#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
- __a[(N) & 15] = (I); \
- __a;}))
-#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
- __a[(N) & 3] = (I); \
- __a;}))
+#define _mm_insert_epi8(X, I, N) (__extension__ \
+ ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+ __a[(N) & 15] = (I); \
+ __a;}))
+#define _mm_insert_epi32(X, I, N) (__extension__ \
+ ({ __v4si __a = (__v4si)(__m128i)(X); \
+ __a[(N) & 3] = (I); \
+ __a;}))
#ifdef __x86_64__
-#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
- __a[(N) & 1] = (I); \
- __a;}))
+#define _mm_insert_epi64(X, I, N) (__extension__ \
+ ({ __v2di __a = (__v2di)(__m128i)(X); \
+ __a[(N) & 1] = (I); \
+ __a;}))
#endif /* __x86_64__ */
/* Extract int from packed integer array at index. This returns the element
* as a zero extended value, so it is unsigned.
*/
-#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
- (int)(unsigned char) \
- __a[(N) & 15];}))
-#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
- __a[(N) & 3];}))
+#define _mm_extract_epi8(X, N) (__extension__ \
+ ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+ (int)(unsigned char) __a[(N) & 15];}))
+#define _mm_extract_epi32(X, N) (__extension__ \
+ ({ __v4si __a = (__v4si)(__m128i)(X); \
+ (int)__a[(N) & 3];}))
#ifdef __x86_64__
-#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
- __a[(N) & 1];}))
+#define _mm_extract_epi64(X, N) (__extension__ \
+ ({ __v2di __a = (__v2di)(__m128i)(X); \
+ (long long)__a[(N) & 1];}))
#endif /* __x86_64 */
/* SSE4 128-bit Packed Integer Comparisons. */
@@ -290,37 +281,44 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi16(__m128i __V)
{
- return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
+ /* This function always performs a signed extension, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi32(__m128i __V)
{
- return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
+ /* This function always performs a signed extension, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi64(__m128i __V)
{
- return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
+ /* This function always performs a signed extension, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ typedef signed char __v16qs __attribute__((__vector_size__(16)));
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi32(__m128i __V)
{
- return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V);
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi64(__m128i __V)
{
- return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V);
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi32_epi64(__m128i __V)
{
- return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V);
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
}
/* SSE4 Packed Integer Zero-Extension. */
@@ -369,9 +367,8 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
/* SSE4 Multiple Packed Sums of Absolute Difference. */
#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
- __m128i __X = (X); \
- __m128i __Y = (Y); \
- (__m128i) __builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, (M)); })
+ (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (M)); })
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_minpos_epu16(__m128i __V)
@@ -379,9 +376,13 @@ _mm_minpos_epu16(__m128i __V)
return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
}
+/* Handle the sse4.2 definitions here. */
+
/* These definitions are normally in nmmintrin.h, but gcc puts them in here
so we'll do the same. */
-#ifdef __SSE4_2__
+
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
/* These specify the type of data that we're comparing. */
#define _SIDD_UBYTE_OPS 0x00
@@ -410,36 +411,59 @@ _mm_minpos_epu16(__m128i __V)
#define _SIDD_UNIT_MASK 0x40
/* SSE4.2 Packed Comparison Intrinsics. */
-#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M))
-#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M))
+#define _mm_cmpistrm(A, B, M) \
+ (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistri(A, B, M) \
+ (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpestrm(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M))
+ (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
#define _mm_cmpestri(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M))
-
+ (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
+
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
#define _mm_cmpistra(A, B, M) \
- __builtin_ia32_pcmpistria128((A), (B), (M))
+ (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpistrc(A, B, M) \
- __builtin_ia32_pcmpistric128((A), (B), (M))
+ (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpistro(A, B, M) \
- __builtin_ia32_pcmpistrio128((A), (B), (M))
+ (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpistrs(A, B, M) \
- __builtin_ia32_pcmpistris128((A), (B), (M))
+ (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpistrz(A, B, M) \
- __builtin_ia32_pcmpistriz128((A), (B), (M))
+ (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpestra(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M))
+ (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
#define _mm_cmpestrc(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M))
+ (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
#define _mm_cmpestro(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M))
+ (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
#define _mm_cmpestrs(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M))
+ (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
#define _mm_cmpestrz(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M))
+ (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
/* SSE4.2 Compare Packed Data -- Greater Than. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -481,7 +505,4 @@ _mm_crc32_u64(unsigned long long __C, unsigned long long __D)
#include <popcntintrin.h>
#endif
-#endif /* __SSE4_2__ */
-#endif /* __SSE4_1__ */
-
#endif /* _SMMINTRIN_H */