path: root/test/CodeGen/ARM/vuzp.ll
author    Dimitry Andric <dim@FreeBSD.org>  2017-01-02 19:17:04 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2017-01-02 19:17:04 +0000
commit    b915e9e0fc85ba6f398b3fab0db6a81a8913af94 (patch)
tree      98b8f811c7aff2547cab8642daf372d6c59502fb /test/CodeGen/ARM/vuzp.ll
parent    6421cca32f69ac849537a3cff78c352195e99f1b (diff)
Vendor import of llvm trunk r290819 (tag: vendor/llvm/llvm-trunk-r290819)
Notes:
    svn path=/vendor/llvm/dist/; revision=311116
    svn path=/vendor/llvm/llvm-trunk-r290819/; revision=311117; tag=vendor/llvm/llvm-trunk-r290819
Diffstat (limited to 'test/CodeGen/ARM/vuzp.ll')
-rw-r--r--  test/CodeGen/ARM/vuzp.ll  291
1 file changed, 245 insertions(+), 46 deletions(-)
diff --git a/test/CodeGen/ARM/vuzp.ll b/test/CodeGen/ARM/vuzp.ll
index 04499e77fde6..a83a4df5490c 100644
--- a/test/CodeGen/ARM/vuzp.ll
+++ b/test/CodeGen/ARM/vuzp.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
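The CHECK lines in this file are machine-generated by the script named in the NOTE line above. A regeneration sketch, kept as an IR comment because the exact option spelling is an assumption and not part of this commit:

; utils/update_llc_test_checks.py --llc-binary <path-to-llc> test/CodeGen/ARM/vuzp.ll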
define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@@ -6,25 +7,25 @@ define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vuzp.8 d17, d16
-; CHECK-NEXT: vadd.i8 d16, d17, d16
+; CHECK-NEXT: vmul.i8 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
- %tmp5 = add <8 x i8> %tmp3, %tmp4
+ %tmp5 = mul <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5
}
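Throughout this patch the arithmetic that consumes the two deinterleaved halves changes from add to mul. A plausible reason, inferred from the pattern rather than stated in the commit: adding the even lanes of a concatenation to its odd lanes is exactly a pairwise add, so with add the backend may select vpadd and the vuzp these tests exist to check would never appear. A minimal IR sketch of that hazard, with a hypothetical function name:

define <8 x i8> @uzp_add_sketch(<8 x i8> %a, <8 x i8> %b) {
  ; even and odd lanes of the concatenation a:b
  %even = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %odd = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ; add(even, odd) is a pairwise add of a:b, a candidate for vpadd.i8;
  ; mul(even, odd) has no such single-instruction equivalent
  %sum = add <8 x i8> %even, %odd
  ret <8 x i8> %sum
}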
define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_Qres:
; CHECK: @ BB#0:
-; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]]
-; CHECK-NEXT: vmov r0, r1, [[LDR0]]
-; CHECK-NEXT: vmov r2, r3, [[LDR1]]
+; CHECK-NEXT: vldr d17, [r1]
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vuzp.8 d16, d17
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
@@ -38,25 +39,25 @@ define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vuzp.16 d17, d16
-; CHECK-NEXT: vadd.i16 d16, d17, d16
+; CHECK-NEXT: vmul.i16 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
- %tmp5 = add <4 x i16> %tmp3, %tmp4
+ %tmp5 = mul <4 x i16> %tmp3, %tmp4
ret <4 x i16> %tmp5
}
define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16_Qres:
; CHECK: @ BB#0:
-; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT: vuzp.16 [[LDR0]], [[LDR1]]
-; CHECK-NEXT: vmov r0, r1, [[LDR0]]
-; CHECK-NEXT: vmov r2, r3, [[LDR1]]
+; CHECK-NEXT: vldr d17, [r1]
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vuzp.16 d16, d17
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -206,25 +207,25 @@ define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vuzp.8 d17, d16
-; CHECK-NEXT: vadd.i8 d16, d17, d16
+; CHECK-NEXT: vmul.i8 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
- %tmp5 = add <8 x i8> %tmp3, %tmp4
+ %tmp5 = mul <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5
}
define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef_Qres:
; CHECK: @ BB#0:
-; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]]
-; CHECK-NEXT: vmov r0, r1, [[LDR0]]
-; CHECK-NEXT: vmov r2, r3, [[LDR1]]
+; CHECK-NEXT: vldr d17, [r1]
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vuzp.8 d16, d17
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
@@ -266,9 +267,16 @@ define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
+; CHECK-LABEL: vuzp_lower_shufflemask_undef:
+; CHECK: @ BB#0: @ %entry
+; CHECK-NEXT: vldr d17, [r1]
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vorr q9, q8, q8
+; CHECK-NEXT: vuzp.16 q8, q9
+; CHECK-NEXT: vmov r0, r1, d18
+; CHECK-NEXT: vmov r2, r3, d19
+; CHECK-NEXT: mov pc, lr
entry:
- ; CHECK-LABEL: vuzp_lower_shufflemask_undef
- ; CHECK: vuzp
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
%0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
@@ -276,10 +284,17 @@ entry:
}
define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
+; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
+; CHECK: @ BB#0: @ %entry
+; CHECK-NEXT: vldr d17, [r1]
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vdup.32 q9, d16[0]
+; CHECK-NEXT: vuzp.32 q8, q9
+; CHECK-NEXT: vext.32 q8, q9, q9, #2
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
entry:
- ; CHECK-LABEL: vuzp_lower_shufflemask_zeroed
- ; CHECK-NOT: vtrn
- ; CHECK: vuzp
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = load <2 x i32>, <2 x i32>* %B
%0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
@@ -287,10 +302,15 @@ entry:
}
define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
+; CHECK-LABEL: vuzp_rev_shufflemask_vtrn:
+; CHECK: @ BB#0: @ %entry
+; CHECK-NEXT: vldr d17, [r1]
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vrev64.32 q9, q8
+; CHECK-NEXT: vuzp.32 q8, q9
+; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
+; CHECK-NEXT: mov pc, lr
entry:
- ; CHECK-LABEL: vuzp_rev_shufflemask_vtrn
- ; CHECK-NOT: vtrn
- ; CHECK: vuzp
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = load <2 x i32>, <2 x i32>* %B
%0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
@@ -302,11 +322,33 @@ define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x
; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
; truncate from i32 to i16 and one vuzp to perform the final truncation to i8.
-; CHECK-LABEL: vuzp_trunc
-; CHECK: vmovn.i32
-; CHECK: vmovn.i32
-; CHECK: vuzp
-; CHECK: vbsl
+; CHECK-LABEL: vuzp_trunc:
+; CHECK: @ BB#0:
+; CHECK-NEXT: .save {r4, r5, r11, lr}
+; CHECK-NEXT: push {r4, r5, r11, lr}
+; CHECK-NEXT: add r12, sp, #48
+; CHECK-NEXT: add lr, sp, #16
+; CHECK-NEXT: add r4, sp, #64
+; CHECK-NEXT: add r5, sp, #32
+; CHECK-NEXT: vld1.64 {d16, d17}, [r5]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r4]
+; CHECK-NEXT: vld1.64 {d20, d21}, [lr]
+; CHECK-NEXT: vld1.64 {d22, d23}, [r12]
+; CHECK-NEXT: vcgt.u32 q8, q9, q8
+; CHECK-NEXT: vcgt.u32 q9, q11, q10
+; CHECK-NEXT: vmovn.i32 d16, q8
+; CHECK-NEXT: vmovn.i32 d17, q9
+; CHECK-NEXT: vmov.i8 d18, #0x7
+; CHECK-NEXT: vmov d19, r0, r1
+; CHECK-NEXT: vuzp.8 d17, d16
+; CHECK-NEXT: vneg.s8 d16, d18
+; CHECK-NEXT: vshl.i8 d17, d17, #7
+; CHECK-NEXT: vmov d18, r2, r3
+; CHECK-NEXT: vshl.s8 d16, d17, d16
+; CHECK-NEXT: vbsl d16, d19, d18
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: pop {r4, r5, r11, lr}
+; CHECK-NEXT: mov pc, lr
%c = icmp ult <8 x i32> %cmp0, %cmp1
%res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
ret <8 x i8> %res
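The comment above this function describes the i32-to-i8 truncation strategy. A sketch of the same data movement in plain IR, under the assumption of a little-endian lane layout and with a hypothetical function name: each vmovn.i32 keeps the low i16 of every i32 lane, the low byte of each i16 lane then sits at an even byte index, and one even-byte deinterleave (the vuzp.8) finishes the truncation.

define <8 x i8> @narrow_then_uzp_sketch(<4 x i32> %lo, <4 x i32> %hi) {
  ; two vmovn.i32-style narrows: <4 x i32> -> <4 x i16>
  %lo16 = trunc <4 x i32> %lo to <4 x i16>
  %hi16 = trunc <4 x i32> %hi to <4 x i16>
  %all16 = shufflevector <4 x i16> %lo16, <4 x i16> %hi16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; on little-endian the even bytes of an i16 vector are the low bytes of its
  ; lanes, so selecting them is the final i16 -> i8 truncation (one vuzp.8 half)
  %bytes = bitcast <8 x i16> %all16 to <16 x i8>
  %t = shufflevector <16 x i8> %bytes, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  ret <8 x i8> %t
}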
@@ -316,11 +358,31 @@ define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x
; We need to extend the loaded <4 x i8> to <4 x i16>. Otherwise we wouldn't be able
; to perform the vuzp and get the vbsl mask.
define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
+; CHECK-LABEL: vuzp_trunc_and_shuffle:
+; CHECK: @ BB#0:
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: ldr r12, [sp, #40]
+; CHECK-NEXT: add lr, sp, #24
+; CHECK-NEXT: add r4, sp, #8
+; CHECK-NEXT: vld1.64 {d16, d17}, [r4]
+; CHECK-NEXT: vld1.64 {d18, d19}, [lr]
+; CHECK-NEXT: vld1.32 {d20[0]}, [r12:32]
+; CHECK-NEXT: vcgt.u32 q8, q9, q8
+; CHECK-NEXT: vmovn.i32 d16, q8
+; CHECK-NEXT: vmov.i8 d17, #0x7
+; CHECK-NEXT: vneg.s8 d17, d17
+; CHECK-NEXT: vmovl.u8 q9, d20
+; CHECK-NEXT: vuzp.8 d16, d18
+; CHECK-NEXT: vshl.i8 d16, d16, #7
+; CHECK-NEXT: vmov d18, r2, r3
+; CHECK-NEXT: vmov d19, r0, r1
+; CHECK-NEXT: vshl.s8 d16, d16, d17
+; CHECK-NEXT: vbsl d16, d19, d18
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: pop {r4, lr}
+; CHECK-NEXT: mov pc, lr
<4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
-; CHECK-LABEL: vuzp_trunc_and_shuffle
-; CHECK: vmovl
-; CHECK: vuzp
-; CHECK: vbsl
%cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
%cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
%c0 = icmp ult <4 x i32> %cmp0, %cmp1
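The comment before this function explains why the loaded <4 x i8> has to be widened. A minimal sketch of just that step, with a hypothetical function name; the vld1.32/vmovl.u8 pair in the CHECK lines above is this load-then-zero-extend:

define <4 x i16> @widen_mask_sketch(<4 x i8>* %p) {
  ; four bytes loaded into a single lane (vld1.32) ...
  %m8 = load <4 x i8>, <4 x i8>* %p, align 4
  ; ... then each byte widened to i16 (vmovl.u8)
  %m16 = zext <4 x i8> %m8 to <4 x i16>
  ret <4 x i16> %m16
}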
@@ -332,10 +394,28 @@ define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
; Use an undef value for the <4 x i8> that is being shuffled with the compare result.
; This produces a build_vector with some of the operands undefs.
define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
+; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right:
+; CHECK: @ BB#0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: add r12, sp, #24
+; CHECK-NEXT: add lr, sp, #8
+; CHECK-NEXT: vld1.64 {d16, d17}, [lr]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT: vcgt.u32 q8, q9, q8
+; CHECK-NEXT: vmov d19, r0, r1
+; CHECK-NEXT: vmovn.i32 d16, q8
+; CHECK-NEXT: vmov.i8 d17, #0x7
+; CHECK-NEXT: vuzp.8 d16, d18
+; CHECK-NEXT: vneg.s8 d17, d17
+; CHECK-NEXT: vshl.i8 d16, d16, #7
+; CHECK-NEXT: vmov d18, r2, r3
+; CHECK-NEXT: vshl.s8 d16, d16, d17
+; CHECK-NEXT: vbsl d16, d19, d18
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: pop {r11, lr}
+; CHECK-NEXT: mov pc, lr
<4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
-; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right
-; CHECK: vuzp
-; CHECK: vbsl
%cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
%cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
%c0 = icmp ult <4 x i32> %cmp0, %cmp1
@@ -345,10 +425,40 @@ define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1
}
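The comment before this function describes shuffling the compare result with an undef vector. A reduced sketch of that shape, with a hypothetical function name: the right-hand shuffle operand is undef, so the top lanes of the mask the backend sees are undef and only one side of the vuzp needs real data.

define <8 x i8> @undef_right_sketch(<4 x i1> %c0, <8 x i8> %a, <8 x i8> %b) {
  ; widen the 4-lane compare mask to 8 lanes; the top half is undef
  %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = select <8 x i1> %c, <8 x i8> %a, <8 x i8> %b
  ret <8 x i8> %r
}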
define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
+; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left:
+; CHECK: @ BB#0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: add r12, sp, #24
+; CHECK-NEXT: add lr, sp, #8
+; CHECK-NEXT: vldr d20, .LCPI22_0
+; CHECK-NEXT: vld1.64 {d16, d17}, [lr]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT: vcgt.u32 q8, q9, q8
+; CHECK-NEXT: vmov d18, r2, r3
+; CHECK-NEXT: vmov d19, r0, r1
+; CHECK-NEXT: vmovn.i32 d16, q8
+; CHECK-NEXT: vmov.i8 d17, #0x7
+; CHECK-NEXT: vtbl.8 d16, {d16}, d20
+; CHECK-NEXT: vneg.s8 d17, d17
+; CHECK-NEXT: vshl.i8 d16, d16, #7
+; CHECK-NEXT: vshl.s8 d16, d16, d17
+; CHECK-NEXT: vbsl d16, d19, d18
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: pop {r11, lr}
+; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: .p2align 3
+; CHECK-NEXT: @ BB#1:
+; CHECK-NEXT: .LCPI22_0:
+; CHECK-NEXT: .byte 255 @ 0xff
+; CHECK-NEXT: .byte 255 @ 0xff
+; CHECK-NEXT: .byte 255 @ 0xff
+; CHECK-NEXT: .byte 255 @ 0xff
+; CHECK-NEXT: .byte 0 @ 0x0
+; CHECK-NEXT: .byte 2 @ 0x2
+; CHECK-NEXT: .byte 4 @ 0x4
+; CHECK-NEXT: .byte 6 @ 0x6
<4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
-; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left
-; CHECK: vuzp
-; CHECK: vbsl
%cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
%cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
%c0 = icmp ult <4 x i32> %cmp0, %cmp1
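The .LCPI22_0 table above drives the vtbl.8 in this function's CHECK lines. In NEON's vtbl an out-of-range index such as 0xff yields zero, and the indices 0, 2, 4, 6 pick the even bytes of the narrowed compare, so the table performs the even-lane extraction a vuzp.8 would while filling the undef half with zeros. The equivalent shuffle in IR, as a sketch with a hypothetical name:

define <8 x i8> @tbl_equivalent_sketch(<8 x i8> %narrowed) {
  ; lanes 0-3 correspond to the 0xff table entries (don't-care lanes);
  ; lanes 4-7 take bytes 0, 2, 4, 6 of %narrowed, i.e. its even lanes
  %r = shufflevector <8 x i8> %narrowed, <8 x i8> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  ret <8 x i8> %r
}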
@@ -360,9 +470,79 @@ define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
; We're using large data types here, and we have to fill with undef values until we
; get some vector size that we can represent.
define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
+; CHECK-LABEL: vuzp_wide_type:
+; CHECK: @ BB#0:
+; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT: .setfp r11, sp, #16
+; CHECK-NEXT: add r11, sp, #16
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, sp, #8
+; CHECK-NEXT: bic sp, sp, #15
+; CHECK-NEXT: add r5, r11, #52
+; CHECK-NEXT: add r7, r11, #32
+; CHECK-NEXT: add r4, r11, #44
+; CHECK-NEXT: add r6, r11, #24
+; CHECK-NEXT: add r12, r11, #60
+; CHECK-NEXT: add lr, r11, #40
+; CHECK-NEXT: vld1.32 {d17[0]}, [r7:32]
+; CHECK-NEXT: vld1.32 {d19[0]}, [r5:32]
+; CHECK-NEXT: vld1.32 {d22[0]}, [r12:32]
+; CHECK-NEXT: ldr r12, [r11, #64]
+; CHECK-NEXT: vld1.32 {d20[0]}, [lr:32]
+; CHECK-NEXT: add r7, r11, #48
+; CHECK-NEXT: add r5, r11, #28
+; CHECK-NEXT: vld1.32 {d16[0]}, [r6:32]
+; CHECK-NEXT: vld1.32 {d18[0]}, [r4:32]
+; CHECK-NEXT: add r6, r11, #56
+; CHECK-NEXT: add r4, r11, #36
+; CHECK-NEXT: vcgt.u32 q10, q11, q10
+; CHECK-NEXT: vld1.32 {d19[1]}, [r6:32]
+; CHECK-NEXT: vld1.32 {d17[1]}, [r4:32]
+; CHECK-NEXT: add r6, r12, #4
+; CHECK-NEXT: vld1.32 {d18[1]}, [r7:32]
+; CHECK-NEXT: vld1.32 {d16[1]}, [r5:32]
+; CHECK-NEXT: ldr r7, [r12]
+; CHECK-NEXT: vcgt.u32 q8, q9, q8
+; CHECK-NEXT: vmovn.i32 d18, q10
+; CHECK-NEXT: vmov.32 d21[0], r7
+; CHECK-NEXT: vmovn.i32 d16, q8
+; CHECK-NEXT: vmov.u8 r7, d21[3]
+; CHECK-NEXT: vmov.i8 d17, #0x7
+; CHECK-NEXT: vuzp.8 d16, d18
+; CHECK-NEXT: vmov.8 d23[0], r7
+; CHECK-NEXT: vneg.s8 d17, d17
+; CHECK-NEXT: add r7, r11, #8
+; CHECK-NEXT: vldr d18, .LCPI23_0
+; CHECK-NEXT: vld1.8 {d23[1]}, [r6]
+; CHECK-NEXT: vshl.i8 d16, d16, #7
+; CHECK-NEXT: vshl.s8 d20, d16, d17
+; CHECK-NEXT: vmov.i8 q8, #0x7
+; CHECK-NEXT: vneg.s8 q8, q8
+; CHECK-NEXT: vtbl.8 d22, {d20, d21}, d18
+; CHECK-NEXT: vld1.64 {d18, d19}, [r7]
+; CHECK-NEXT: vshl.i8 q10, q11, #7
+; CHECK-NEXT: vmov d23, r2, r3
+; CHECK-NEXT: vmov d22, r0, r1
+; CHECK-NEXT: vshl.s8 q8, q10, q8
+; CHECK-NEXT: vbsl q8, q11, q9
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: sub sp, r11, #16
+; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: .p2align 3
+; CHECK-NEXT: @ BB#1:
+; CHECK-NEXT: .LCPI23_0:
+; CHECK-NEXT: .byte 0 @ 0x0
+; CHECK-NEXT: .byte 1 @ 0x1
+; CHECK-NEXT: .byte 2 @ 0x2
+; CHECK-NEXT: .byte 3 @ 0x3
+; CHECK-NEXT: .byte 4 @ 0x4
+; CHECK-NEXT: .byte 8 @ 0x8
+; CHECK-NEXT: .byte 9 @ 0x9
+; CHECK-NEXT: .byte 10 @ 0xa
<5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) {
-; CHECK-LABEL: vuzp_wide_type
-; CHECK: vbsl
%cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4
%cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
%c0 = icmp ult <5 x i32> %cmp0, %cmp1
@@ -370,3 +550,22 @@ define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
%rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
ret <10 x i8> %rv
}
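The comment before vuzp_wide_type says the odd-sized vectors are padded with undef until a representable size is reached. A sketch of that widening, with a hypothetical function name: <10 x i8> is not a legal NEON type, so its lanes are carried into a <16 x i8> and the tail is left undef before the usual compare/select lowering runs.

define <16 x i8> @pad_v10i8_sketch(<10 x i8> %v) {
  ; the ten live lanes keep their positions; lanes 10-15 are undef padding
  %w = shufflevector <10 x i8> %v, <10 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %w
}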
+
+%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
+define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
+; CHECK-LABEL: vuzp_extract_subvector:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vorr d18, d17, d17
+; CHECK-NEXT: vuzp.8 d16, d18
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d18
+; CHECK-NEXT: mov pc, lr
+
+ %vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+ ret %struct.uint8x8x2_t %.fca.0.1.insert
+}
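This new test returns both halves of the deinterleave through %struct.uint8x8x2_t, the return shape of the NEON intrinsic vuzp_u8 (a likely origin for the test, though the commit does not say so). Because vuzp.8 rewrites both of its register operands, the CHECK lines first duplicate d17 with vorr so each struct field lands in its own register. For contrast, a single-result sketch with a hypothetical name, where only the even half is live:

define <8 x i8> @single_result_sketch(<16 x i8> %t) {
  ; only the even-byte half of %t is used
  %even = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  ret <8 x i8> %even
}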