Diffstat (limited to 'test/CodeGen')
44 files changed, 2643 insertions, 2970 deletions
diff --git a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll index ae77f7e099db..412651c55678 100644 --- a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll +++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll @@ -1,5 +1,6 @@ ; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s -check-prefix=CYCLONE --check-prefix=ALL ; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s -check-prefix=KRYO --check-prefix=ALL +; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=falkor < %s | FileCheck %s -check-prefix=FALKOR --check-prefix=ALL ; rdar://11481771 ; rdar://13713797 @@ -16,6 +17,10 @@ entry: ; KRYO: movi v1.2d, #0000000000000000 ; KRYO: movi v2.2d, #0000000000000000 ; KRYO: movi v3.2d, #0000000000000000 +; FALKOR: movi v0.2d, #0000000000000000 +; FALKOR: movi v1.2d, #0000000000000000 +; FALKOR: movi v2.2d, #0000000000000000 +; FALKOR: movi v3.2d, #0000000000000000 tail call void @bar(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00) nounwind ret void } @@ -47,6 +52,8 @@ define void @t4() nounwind ssp { ; CYCLONE: movi.2d v1, #0000000000000000 ; KRYO: movi v0.2d, #0000000000000000 ; KRYO: movi v1.2d, #0000000000000000 +; FALKOR: movi v0.2d, #0000000000000000 +; FALKOR: movi v1.2d, #0000000000000000 tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind ret void } diff --git a/test/CodeGen/AArch64/store_merge_pair_offset.ll b/test/CodeGen/AArch64/store_merge_pair_offset.ll new file mode 100644 index 000000000000..a091f0fd911c --- /dev/null +++ b/test/CodeGen/AArch64/store_merge_pair_offset.ll @@ -0,0 +1,12 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -enable-misched=false -enable-post-misched=false -o - %s | FileCheck %s + +define i64 @test(i64* %a) nounwind { + ; CHECK: ldp x{{[0-9]+}}, x{{[0-9]+}} + ; CHECK-NOT: ldr + %p1 = getelementptr inbounds i64, i64* %a, i32 64 + %tmp1 = load i64, i64* %p1, align 2 + %p2 = getelementptr inbounds i64, i64* %a, i32 63 + %tmp2 = load i64, i64* %p2, align 2 + %tmp3 = add i64 %tmp1, %tmp2 + ret i64 %tmp3 +} diff --git a/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll b/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll new file mode 100644 index 000000000000..8d8885852afe --- /dev/null +++ b/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll @@ -0,0 +1,41 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +; GCN-LABEL: {{^}}main: +; GCN: s_mov_b32 m0, s0 +; VI-NEXT: s_nop 0 +; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP) +; GCN-NEXT: s_endpgm + +define amdgpu_gs void @main(i32 inreg %a) #0 { + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %a) + ret void +} + +; GCN-LABEL: {{^}}main_halt: +; GCN: s_mov_b32 m0, s0 +; VI-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsghalt sendmsg(MSG_INTERRUPT) +; GCN-NEXT: s_endpgm + +define void @main_halt(i32 inreg %a) #0 { + call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %a) + ret void +} + +; GCN-LABEL: {{^}}legacy: +; GCN: s_mov_b32 m0, s0 +; VI-NEXT: s_nop 0 +; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP) +; GCN-NEXT: s_endpgm + +define amdgpu_gs void @legacy(i32 inreg %a) #0 { + call void @llvm.SI.sendmsg(i32 3, i32 %a) + ret void +} + +declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0 +declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0 +declare void @llvm.SI.sendmsg(i32, i32) #0 + +attributes #0 
= { nounwind } diff --git a/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll b/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll new file mode 100644 index 000000000000..31f9cfca6def --- /dev/null +++ b/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll @@ -0,0 +1,161 @@ +;RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}test_interrupt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT) +define void @test_interrupt() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0); + ret void +} + +; CHECK-LABEL: {{^}}test_gs_emit: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0) +define void @test_gs_emit() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 34, i32 0); + ret void +} + +; CHECK-LABEL: {{^}}test_gs_cut: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1) +define void @test_gs_cut() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 274, i32 0); + ret void +} + +; CHECK-LABEL: {{^}}test_gs_emit_cut: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) +define void @test_gs_emit_cut() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 562, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_gs_done: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP) +define void @test_gs_done() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) + ret void +} + + +; CHECK-LABEL: {{^}}test_interrupt_halt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsghalt sendmsg(MSG_INTERRUPT) +define void @test_interrupt_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_gs_emit_halt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT, 0) +define void @test_gs_emit_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 34, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_gs_cut_halt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_CUT, 1) +define void @test_gs_cut_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 274, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_gs_emit_cut_halt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) +define void @test_gs_emit_cut_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 562, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_gs_done_halt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsghalt sendmsg(MSG_GS_DONE, GS_OP_NOP) +define void @test_gs_done_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 3, i32 0) + ret void +} + +; Legacy +; CHECK-LABEL: {{^}}test_legacy_interrupt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT) +define void @test_legacy_interrupt() { +body: + call void @llvm.SI.sendmsg(i32 1, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_legacy_gs_emit: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0) +define void @test_legacy_gs_emit() { +body: + call void @llvm.SI.sendmsg(i32 34, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_legacy_gs_cut: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg 
sendmsg(MSG_GS, GS_OP_CUT, 1) +define void @test_legacy_gs_cut() { +body: + call void @llvm.SI.sendmsg(i32 274, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_legacy_gs_emit_cut: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) +define void @test_legacy_gs_emit_cut() { +body: + call void @llvm.SI.sendmsg(i32 562, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_legacy_gs_done: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP) +define void @test_legacy_gs_done() { +body: + call void @llvm.SI.sendmsg(i32 3, i32 0) + ret void +} + +; Function Attrs: nounwind +declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0 +declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0 +declare void @llvm.SI.sendmsg(i32, i32) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll deleted file mode 100644 index 2d4987643a2b..000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -; GCN-LABEL: {{^}}main: -; GCN: s_mov_b32 m0, s0 -; VI-NEXT: s_nop 0 -; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP) -; GCN-NEXT: s_endpgm - -define amdgpu_gs void @main(i32 inreg %a) #0 { - call void @llvm.SI.sendmsg(i32 3, i32 %a) - ret void -} - -declare void @llvm.SI.sendmsg(i32, i32) #0 - -attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll deleted file mode 100644 index c4bb27676e7d..000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll +++ /dev/null @@ -1,24 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0) -; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1) -; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) -; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP) - -define void @main() { -main_body: - call void @llvm.SI.sendmsg(i32 34, i32 0); - call void @llvm.SI.sendmsg(i32 274, i32 0); - call void @llvm.SI.sendmsg(i32 562, i32 0); - call void @llvm.SI.sendmsg(i32 3, i32 0); - ret void -} - -; Function Attrs: nounwind -declare void @llvm.SI.sendmsg(i32, i32) #0 - -attributes #0 = { nounwind } diff --git a/test/CodeGen/PowerPC/ppc64-blnop.ll b/test/CodeGen/PowerPC/ppc64-blnop.ll new file mode 100644 index 000000000000..2fe23f91c83d --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64-blnop.ll @@ -0,0 +1,129 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s +; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -function-sections -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-FS +; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s 
-function-sections -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-FS + +%class.T = type { [2 x i8] } + +define void @e_callee(%class.T* %this, i8* %c) { ret void } +define void @e_caller(%class.T* %this, i8* %c) { + call void @e_callee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: e_caller: +; CHECK: bl e_callee +; CHECK-NEXT: nop + +; CHECK-FS-LABEL: e_caller: +; CHECK-FS: bl e_callee +; CHECK-FS-NEXT: nop +} + +define void @e_scallee(%class.T* %this, i8* %c) section "different" { ret void } +define void @e_scaller(%class.T* %this, i8* %c) { + call void @e_scallee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: e_scaller: +; CHECK: bl e_scallee +; CHECK-NEXT: nop +} + +define void @e_s2callee(%class.T* %this, i8* %c) { ret void } +define void @e_s2caller(%class.T* %this, i8* %c) section "different" { + call void @e_s2callee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: e_s2caller: +; CHECK: bl e_s2callee +; CHECK-NEXT: nop +} + +$cd1 = comdat any +$cd2 = comdat any + +define void @e_ccallee(%class.T* %this, i8* %c) comdat($cd1) { ret void } +define void @e_ccaller(%class.T* %this, i8* %c) comdat($cd2) { + call void @e_ccallee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: e_ccaller: +; CHECK: bl e_ccallee +; CHECK-NEXT: nop +} + +$cd = comdat any + +define void @e_c1callee(%class.T* %this, i8* %c) comdat($cd) { ret void } +define void @e_c1caller(%class.T* %this, i8* %c) comdat($cd) { + call void @e_c1callee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: e_c1caller: +; CHECK: bl e_c1callee +; CHECK-NEXT: nop +} + +define weak_odr hidden void @wo_hcallee(%class.T* %this, i8* %c) { ret void } +define void @wo_hcaller(%class.T* %this, i8* %c) { + call void @wo_hcallee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: wo_hcaller: +; CHECK: bl wo_hcallee +; CHECK-NEXT: nop +} + +define weak_odr protected void @wo_pcallee(%class.T* %this, i8* %c) { ret void } +define void @wo_pcaller(%class.T* %this, i8* %c) { + call void @wo_pcallee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: wo_pcaller: +; CHECK: bl wo_pcallee +; CHECK-NEXT: nop +} + +define weak_odr void @wo_callee(%class.T* %this, i8* %c) { ret void } +define void @wo_caller(%class.T* %this, i8* %c) { + call void @wo_callee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: wo_caller: +; CHECK: bl wo_callee +; CHECK-NEXT: nop +} + +define weak protected void @w_pcallee(i8* %ptr) { ret void } +define void @w_pcaller(i8* %ptr) { + call void @w_pcallee(i8* %ptr) + ret void + +; CHECK-LABEL: w_pcaller: +; CHECK: bl w_pcallee +; CHECK-NEXT: nop +} + +define weak hidden void @w_hcallee(i8* %ptr) { ret void } +define void @w_hcaller(i8* %ptr) { + call void @w_hcallee(i8* %ptr) + ret void + +; CHECK-LABEL: w_hcaller: +; CHECK: bl w_hcallee +; CHECK-NEXT: nop +} + +define weak void @w_callee(i8* %ptr) { ret void } +define void @w_caller(i8* %ptr) { + call void @w_callee(i8* %ptr) + ret void + +; CHECK-LABEL: w_caller: +; CHECK: bl w_callee +; CHECK-NEXT: nop +} + diff --git a/test/CodeGen/PowerPC/ppc64-sibcall.ll b/test/CodeGen/PowerPC/ppc64-sibcall.ll index 418b7828f1d9..59e545601475 100644 --- a/test/CodeGen/PowerPC/ppc64-sibcall.ll +++ b/test/CodeGen/PowerPC/ppc64-sibcall.ll @@ -142,7 +142,7 @@ define void @wo_hcaller(%class.T* %this, i8* %c) { ret void ; CHECK-SCO-LABEL: wo_hcaller: -; CHECK-SCO: b wo_hcallee +; CHECK-SCO: bl wo_hcallee } define weak_odr protected void @wo_pcallee(%class.T* %this, i8* %c) { ret void } @@ -151,7 +151,7 @@ define 
void @wo_pcaller(%class.T* %this, i8* %c) { ret void ; CHECK-SCO-LABEL: wo_pcaller: -; CHECK-SCO: b wo_pcallee +; CHECK-SCO: bl wo_pcallee } define weak_odr void @wo_callee(%class.T* %this, i8* %c) { ret void } @@ -169,7 +169,7 @@ define void @w_pcaller(i8* %ptr) { ret void ; CHECK-SCO-LABEL: w_pcaller: -; CHECK-SCO: b w_pcallee +; CHECK-SCO: bl w_pcallee } define weak hidden void @w_hcallee(i8* %ptr) { ret void } @@ -178,7 +178,7 @@ define void @w_hcaller(i8* %ptr) { ret void ; CHECK-SCO-LABEL: w_hcaller: -; CHECK-SCO: b w_hcallee +; CHECK-SCO: bl w_hcallee } define weak void @w_callee(i8* %ptr) { ret void } diff --git a/test/CodeGen/SPARC/soft-float.ll b/test/CodeGen/SPARC/soft-float.ll index 53ca1974659e..582804444f3b 100644 --- a/test/CodeGen/SPARC/soft-float.ll +++ b/test/CodeGen/SPARC/soft-float.ll @@ -45,21 +45,21 @@ define fp128 @test_multf3(fp128 %a, fp128 %b) #0 { } define float @test_subsf3(float %a, float %b) #0 { - ; CHCEK-LABEL: test_subsf3: + ; CHECK-LABEL: test_subsf3: ; CHECK: call __subsf3 %sub = fsub float %a, %b ret float %sub } define double @test_subdf3(double %a, double %b) #0 { - ; CHCEK-LABEL: test_subdf3: + ; CHECK-LABEL: test_subdf3: ; CHECK: call __subdf3 %sub = fsub double %a, %b ret double %sub } define fp128 @test_subtf3(fp128 %a, fp128 %b) #0 { - ; CHCEK-LABEL: test_subtf3: + ; CHECK-LABEL: test_subtf3: ; CHECK: call __subtf3 %sub = fsub fp128 %a, %b ret fp128 %sub diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll index b50253bf2b03..4d7cb765d7b9 100644 --- a/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -371,6 +371,40 @@ define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) { } ; Make sure that we merge the consecutive load/store sequence below and use a +; word (16 bit) instead of a byte copy for complicated address calculation. +; . +; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetComplicated: +; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]] +; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]] +; CHECK: movw %[[REG]], (%{{.*}}) +define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) { + br label %1 + +; <label>:1 + %.09 = phi i64 [ 0, %0 ], [ %13, %1 ] + %.08 = phi i8* [ %b, %0 ], [ %12, %1 ] + %2 = load i8, i8* %.08, align 1 + %3 = sext i8 %2 to i64 + %4 = getelementptr inbounds i8, i8* %c, i64 %3 + %5 = load i8, i8* %4, align 1 + %6 = add nsw i64 %3, 1 + %7 = getelementptr inbounds i8, i8* %c, i64 %6 + %8 = load i8, i8* %7, align 1 + %9 = getelementptr inbounds i8, i8* %a, i64 %.09 + store i8 %5, i8* %9, align 1 + %10 = or i64 %.09, 1 + %11 = getelementptr inbounds i8, i8* %a, i64 %10 + store i8 %8, i8* %11, align 1 + %12 = getelementptr inbounds i8, i8* %.08, i64 1 + %13 = add nuw nsw i64 %.09, 2 + %14 = icmp slt i64 %13, %n + br i1 %14, label %1, label %15 + +; <label>:15 + ret void +} + +; Make sure that we merge the consecutive load/store sequence below and use a ; word (16 bit) instead of a byte copy even if there are intermediate sign ; extensions. 
; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetSext: diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll index 9b4d776b29e3..f65f485cc62c 100644 --- a/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/test/CodeGen/X86/avx2-vbroadcast.ll @@ -209,34 +209,22 @@ entry: } define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp { -; X32-AVX2-LABEL: QQ64: -; X32-AVX2: ## BB#0: ## %entry -; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX2-NEXT: movl (%eax), %ecx -; X32-AVX2-NEXT: movl 4(%eax), %eax -; X32-AVX2-NEXT: vmovd %ecx, %xmm0 -; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X32-AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X32-AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; X32-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX2-NEXT: retl +; X32-LABEL: QQ64: +; X32: ## BB#0: ## %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: movl 4(%eax), %eax +; X32-NEXT: vmovd %ecx, %xmm0 +; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: retl ; ; X64-LABEL: QQ64: ; X64: ## BB#0: ## %entry ; X64-NEXT: vbroadcastsd (%rdi), %ymm0 ; X64-NEXT: retq -; -; X32-AVX512VL-LABEL: QQ64: -; X32-AVX512VL: ## BB#0: ## %entry -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512VL-NEXT: movl (%eax), %ecx -; X32-AVX512VL-NEXT: movl 4(%eax), %eax -; X32-AVX512VL-NEXT: vmovd %ecx, %xmm0 -; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; X32-AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512VL-NEXT: retl entry: %q = load i64, i64* %ptr, align 4 %q0 = insertelement <4 x i64> undef, i64 %q, i32 0 @@ -1105,55 +1093,30 @@ define <4 x double> @splat_concat4(double %d) { ; Those test cases exerce the latter. 
define void @isel_crash_16b(i8* %cV_R.addr) { -; X32-AVX2-LABEL: isel_crash_16b: -; X32-AVX2: ## BB#0: ## %eintry -; X32-AVX2-NEXT: subl $60, %esp -; X32-AVX2-NEXT: Lcfi0: -; X32-AVX2-NEXT: .cfi_def_cfa_offset 64 -; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX2-NEXT: vmovaps %xmm0, (%esp) -; X32-AVX2-NEXT: vpbroadcastb (%eax), %xmm1 -; X32-AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: addl $60, %esp -; X32-AVX2-NEXT: retl -; -; X64-AVX2-LABEL: isel_crash_16b: -; X64-AVX2: ## BB#0: ## %eintry -; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movb (%rdi), %al -; X64-AVX2-NEXT: vmovd %eax, %xmm1 -; X64-AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: retq -; -; X32-AVX512VL-LABEL: isel_crash_16b: -; X32-AVX512VL: ## BB#0: ## %eintry -; X32-AVX512VL-NEXT: subl $60, %esp -; X32-AVX512VL-NEXT: Lcfi0: -; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 64 -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX512VL-NEXT: vmovaps %xmm0, (%esp) -; X32-AVX512VL-NEXT: vpbroadcastb (%eax), %xmm1 -; X32-AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: addl $60, %esp -; X32-AVX512VL-NEXT: retl +; X32-LABEL: isel_crash_16b: +; X32: ## BB#0: ## %eintry +; X32-NEXT: subl $60, %esp +; X32-NEXT: Lcfi0: +; X32-NEXT: .cfi_def_cfa_offset 64 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovaps %xmm0, (%esp) +; X32-NEXT: vpbroadcastb (%eax), %xmm1 +; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: addl $60, %esp +; X32-NEXT: retl ; -; X64-AVX512VL-LABEL: isel_crash_16b: -; X64-AVX512VL: ## BB#0: ## %eintry -; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: movb (%rdi), %al -; X64-AVX512VL-NEXT: vmovd %eax, %xmm1 -; X64-AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: retq +; X64-LABEL: isel_crash_16b: +; X64: ## BB#0: ## %eintry +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: movb (%rdi), %al +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpbroadcastb %xmm1, %xmm1 +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: retq eintry: %__a.addr.i = alloca <2 x i64>, align 16 %__b.addr.i = alloca <2 x i64>, align 16 @@ -1277,55 +1240,30 @@ eintry: } define void @isel_crash_8w(i16* %cV_R.addr) { -; X32-AVX2-LABEL: isel_crash_8w: -; X32-AVX2: ## BB#0: ## %entry -; X32-AVX2-NEXT: subl $60, %esp -; X32-AVX2-NEXT: Lcfi4: -; X32-AVX2-NEXT: .cfi_def_cfa_offset 64 -; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX2-NEXT: vmovaps %xmm0, (%esp) -; X32-AVX2-NEXT: vpbroadcastw (%eax), %xmm1 -; X32-AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: addl $60, %esp -; X32-AVX2-NEXT: retl -; -; X64-AVX2-LABEL: isel_crash_8w: -; X64-AVX2: ## BB#0: ## %entry -; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovaps %xmm0, 
-{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movw (%rdi), %ax -; X64-AVX2-NEXT: vmovd %eax, %xmm1 -; X64-AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: retq -; -; X32-AVX512VL-LABEL: isel_crash_8w: -; X32-AVX512VL: ## BB#0: ## %entry -; X32-AVX512VL-NEXT: subl $60, %esp -; X32-AVX512VL-NEXT: Lcfi4: -; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 64 -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX512VL-NEXT: vmovaps %xmm0, (%esp) -; X32-AVX512VL-NEXT: vpbroadcastw (%eax), %xmm1 -; X32-AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: addl $60, %esp -; X32-AVX512VL-NEXT: retl +; X32-LABEL: isel_crash_8w: +; X32: ## BB#0: ## %entry +; X32-NEXT: subl $60, %esp +; X32-NEXT: Lcfi4: +; X32-NEXT: .cfi_def_cfa_offset 64 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovaps %xmm0, (%esp) +; X32-NEXT: vpbroadcastw (%eax), %xmm1 +; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: addl $60, %esp +; X32-NEXT: retl ; -; X64-AVX512VL-LABEL: isel_crash_8w: -; X64-AVX512VL: ## BB#0: ## %entry -; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: movw (%rdi), %ax -; X64-AVX512VL-NEXT: vmovd %eax, %xmm1 -; X64-AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: retq +; X64-LABEL: isel_crash_8w: +; X64: ## BB#0: ## %entry +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: movw (%rdi), %ax +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpbroadcastw %xmm1, %xmm1 +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: retq entry: %__a.addr.i = alloca <2 x i64>, align 16 %__b.addr.i = alloca <2 x i64>, align 16 @@ -1605,24 +1543,24 @@ eintry: } define void @isel_crash_2q(i64* %cV_R.addr) { -; X32-AVX2-LABEL: isel_crash_2q: -; X32-AVX2: ## BB#0: ## %entry -; X32-AVX2-NEXT: subl $60, %esp -; X32-AVX2-NEXT: Lcfi12: -; X32-AVX2-NEXT: .cfi_def_cfa_offset 64 -; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX2-NEXT: vmovaps %xmm0, (%esp) -; X32-AVX2-NEXT: movl (%eax), %ecx -; X32-AVX2-NEXT: movl 4(%eax), %eax -; X32-AVX2-NEXT: vmovd %ecx, %xmm1 -; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X32-AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; X32-AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: addl $60, %esp -; X32-AVX2-NEXT: retl +; X32-LABEL: isel_crash_2q: +; X32: ## BB#0: ## %entry +; X32-NEXT: subl $60, %esp +; X32-NEXT: Lcfi12: +; X32-NEXT: .cfi_def_cfa_offset 64 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovaps %xmm0, (%esp) +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: movl 4(%eax), %eax +; X32-NEXT: vmovd %ecx, %xmm1 +; X32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; X32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: addl $60, %esp +; X32-NEXT: retl ; ; X64-AVX2-LABEL: 
isel_crash_2q: ; X64-AVX2: ## BB#0: ## %entry @@ -1635,25 +1573,6 @@ define void @isel_crash_2q(i64* %cV_R.addr) { ; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-AVX2-NEXT: retq ; -; X32-AVX512VL-LABEL: isel_crash_2q: -; X32-AVX512VL: ## BB#0: ## %entry -; X32-AVX512VL-NEXT: subl $60, %esp -; X32-AVX512VL-NEXT: Lcfi12: -; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 64 -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX512VL-NEXT: vmovaps %xmm0, (%esp) -; X32-AVX512VL-NEXT: movl (%eax), %ecx -; X32-AVX512VL-NEXT: movl 4(%eax), %eax -; X32-AVX512VL-NEXT: vmovd %ecx, %xmm1 -; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; X32-AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: addl $60, %esp -; X32-AVX512VL-NEXT: retl -; ; X64-AVX512VL-LABEL: isel_crash_2q: ; X64-AVX512VL: ## BB#0: ## %entry ; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 @@ -1752,7 +1671,7 @@ define void @isel_crash_4q(i64* %cV_R.addr) { ; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 ; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; X32-AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm1, %ymm1 +; X32-AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; X32-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) ; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) ; X32-AVX512VL-NEXT: movl %ebp, %esp diff --git a/test/CodeGen/X86/avx512-any_extend_load.ll b/test/CodeGen/X86/avx512-any_extend_load.ll index 656b618eff55..87f8cc9a418e 100644 --- a/test/CodeGen/X86/avx512-any_extend_load.ll +++ b/test/CodeGen/X86/avx512-any_extend_load.ll @@ -22,10 +22,8 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) { define void @any_extend_load_v8i32(<8 x i8> * %ptr) { ; KNL-LABEL: any_extend_load_v8i32: ; KNL: # BB#0: -; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; KNL-NEXT: vmovq %xmm0, (%rdi) ; KNL-NEXT: retq diff --git a/test/CodeGen/X86/avx512-extract-subvector.ll b/test/CodeGen/X86/avx512-extract-subvector.ll index 9e8662452822..391bf6ba4554 100644 --- a/test/CodeGen/X86/avx512-extract-subvector.ll +++ b/test/CodeGen/X86/avx512-extract-subvector.ll @@ -60,7 +60,7 @@ define <32 x i8> @extract_subvector256_v64i8(<64 x i8> %x) nounwind { define void @extract_subvector256_v8f64_store(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v8f64_store: ; SKX: ## BB#0: ## %entry -; SKX-NEXT: vextractf64x2 $1, %ymm0, (%rdi) +; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi) ; SKX-NEXT: retq entry: %0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3> @@ -72,7 +72,7 @@ entry: define void @extract_subvector256_v8f32_store(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v8f32_store: ; SKX: ## BB#0: ## 
%entry -; SKX-NEXT: vextractf32x4 $1, %ymm0, (%rdi) +; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi) ; SKX-NEXT: retq entry: %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -84,7 +84,7 @@ entry: define void @extract_subvector256_v4i64_store(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v4i64_store: ; SKX: ## BB#0: ## %entry -; SKX-NEXT: vextracti64x2 $1, %ymm0, (%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: retq entry: %0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 2, i32 3> @@ -96,7 +96,7 @@ entry: define void @extract_subvector256_v8i32_store(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v8i32_store: ; SKX: ## BB#0: ## %entry -; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: retq entry: %0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -108,7 +108,7 @@ entry: define void @extract_subvector256_v16i16_store(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v16i16_store: ; SKX: ## BB#0: ## %entry -; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: retq entry: %0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -120,7 +120,7 @@ entry: define void @extract_subvector256_v32i8_store(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v32i8_store: ; SKX: ## BB#0: ## %entry -; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: retq entry: %0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index c6cc74289971..26d14fa0840f 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -463,7 +463,7 @@ define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) { ; SKX-LABEL: extract_v4i64: ; SKX: ## BB#0: ; SKX-NEXT: vpextrq $1, %xmm0, %rax -; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm0 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) ; SKX-NEXT: retq %r1 = extractelement <4 x i64> %x, i32 1 @@ -521,7 +521,7 @@ define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) { ; SKX-LABEL: extract_v8i32: ; SKX: ## BB#0: ; SKX-NEXT: vpextrd $1, %xmm0, %eax -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrd $1, %xmm0, (%rdi) ; SKX-NEXT: retq %r1 = extractelement <8 x i32> %x, i32 1 @@ -582,7 +582,7 @@ define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) { ; SKX-LABEL: extract_v16i16: ; SKX: ## BB#0: ; SKX-NEXT: vpextrw $1, %xmm0, %eax -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrw $1, %xmm0, (%rdi) ; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> ; SKX-NEXT: retq @@ -646,7 +646,7 @@ define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) { ; SKX-LABEL: extract_v32i8: ; SKX: ## BB#0: ; SKX-NEXT: vpextrb $1, %xmm0, %eax -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrb $1, %xmm0, (%rdi) ; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; 
SKX-NEXT: retq @@ -714,9 +714,9 @@ define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) { ; SKX: ## BB#0: ; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 ; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm1 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; SKX-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 -; SKX-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq %val = load i64, i64* %ptr %r1 = insertelement <4 x i64> %x, i64 %val, i32 1 @@ -780,9 +780,9 @@ define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) { ; SKX: ## BB#0: ; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 ; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; SKX-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1 -; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq %val = load i32, i32* %ptr %r1 = insertelement <8 x i32> %x, i32 %val, i32 1 @@ -846,9 +846,9 @@ define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) { ; SKX: ## BB#0: ; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 ; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; SKX-NEXT: vpinsrw $1, %edi, %xmm1, %xmm1 -; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq %val = load i16, i16* %ptr %r1 = insertelement <16 x i16> %x, i16 %val, i32 1 @@ -912,9 +912,9 @@ define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) { ; SKX: ## BB#0: ; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1 ; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; SKX-NEXT: vpinsrb $1, %edi, %xmm1, %xmm1 -; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq %val = load i8, i8* %ptr %r1 = insertelement <32 x i8> %x, i8 %val, i32 1 @@ -1014,9 +1014,9 @@ define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) { ; ; SKX-LABEL: test_insert_128_v16i16: ; SKX: ## BB#0: -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; SKX-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1 -; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq %r = insertelement <16 x i16> %x, i16 %y, i32 10 ret <16 x i16> %r @@ -1032,9 +1032,9 @@ define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) { ; ; SKX-LABEL: test_insert_128_v32i8: ; SKX: ## BB#0: -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; SKX-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1 -; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq %r = insertelement <32 x i8> %x, i8 %y, i32 20 ret <32 x i8> %r diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index f422a0354988..3c649e18bc38 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -2868,3 +2868,187 @@ define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask } declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8) + +define <4 x float> 
@test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) { +; CHECK-LABEL: test_mask_vextractf32x4: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kshiftlw $12, %k1, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kshiftrw $15, %k2, %k2 +; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftrw $15, %k3, %k3 +; CHECK-NEXT: kshiftlw $14, %k1, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kmovw %k3, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm2 +; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8) + +define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) { +; CHECK-LABEL: test_mask_vextracti64x4: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kshiftlw $12, %k1, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kshiftrw $15, %k2, %k2 +; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftrw $15, %k3, %k3 +; CHECK-NEXT: kshiftlw $14, %k1, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kmovw %k3, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm2 +; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vpmovsxdq %xmm2, %ymm2 +; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask) + ret <4 x i64> %res +} + +declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8) + +define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) { +; CHECK-LABEL: test_maskz_vextracti32x4: +; CHECK: ## BB#0: +; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kshiftlw $12, %k1, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kshiftrw $15, %k2, %k2 +; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftrw $15, %k3, %k3 +; CHECK-NEXT: kshiftlw $14, %k1, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kmovw %k3, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm1 +; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vpsrad $31, %xmm1, %xmm1 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8) + +define <4 x double> @test_vextractf64x4(<8 x 
double> %a) { +; CHECK-LABEL: test_vextractf64x4: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8) + +declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) + %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res2, %res3 + ret <16 x float> %res4 +} + +declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res2, %res3 + ret <16 x i32> %res4 +} + +declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) + %res1 = call <8 x 
double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) + %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res2, %res3 + ret <8 x double> %res4 +} + +declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res2, %res3 + ret <8 x i64> %res4 +} diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 5442693806f3..3015a2b499ff 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -1243,53 +1243,6 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone -define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) { -; CHECK-LABEL: test_mask_vextractf32x4: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm0 {%k1} -; CHECK-NEXT: retq - %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask) - ret <4 x float> %res -} - -declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8) - -define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) { -; CHECK-LABEL: test_mask_vextracti64x4: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vextracti64x4 $2, %zmm1, %ymm0 {%k1} -; CHECK-NEXT: retq - %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask) - ret <4 x i64> %res -} - -declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8) - -define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) { -; CHECK-LABEL: test_maskz_vextracti32x4: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: retq - %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask) - ret <4 x i32> %res -} - -declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8) - -define <4 x double> @test_vextractf64x4(<8 x double> %a) { -; CHECK-LABEL: test_vextractf64x4: -; CHECK: ## BB#0: -; CHECK-NEXT: vextractf64x4 $2, %zmm0, %ymm0 -; CHECK-NEXT: retq - %res = call <4 x double> 
@llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1) - ret <4 x double> %res -} - -declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8) - declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) @@ -3984,86 +3937,6 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<1 ret <16 x float> %res2 } -declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) - %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res2, %res3 - ret <16 x float> %res4 -} - -declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res2, %res3 - ret <16 x i32> %res4 -} - -declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: retq - 
%res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res2, %res3 - ret <8 x double> %res4 -} - -declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 -} - declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32) define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) { diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll index a5bceb7670a0..2200f1159880 100644 --- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll +++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll @@ -30,9 +30,9 @@ define <8 x i1> @test2(<2 x i1> %a) { ; CHECK: # BB#0: ; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0 -; CHECK-NEXT: vpmovm2q %k0, %zmm0 -; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vpmovm2q %k0, %zmm1 +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; CHECK-NEXT: vpmovq2m %zmm0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512-vbroadcasti128.ll b/test/CodeGen/X86/avx512-vbroadcasti128.ll index 09c48ddf81a1..ad8a29cacd82 100644 --- a/test/CodeGen/X86/avx512-vbroadcasti128.ll +++ b/test/CodeGen/X86/avx512-vbroadcasti128.ll @@ -237,7 +237,7 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X64-AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; X64-AVX512VL-NEXT: vmovdqa %ymm1, (%rsi) -; X64-AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: PR29088: @@ -245,7 +245,7 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X64-AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512BWVL-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; X64-AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi) -; X64-AVX512BWVL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BWVL-NEXT: vinserti128 $1, %xmm0, 
%ymm0, %ymm0 ; X64-AVX512BWVL-NEXT: retq ; ; X64-AVX512DQVL-LABEL: PR29088: @@ -253,7 +253,7 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512DQVL-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; X64-AVX512DQVL-NEXT: vmovaps %ymm1, (%rsi) -; X64-AVX512DQVL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQVL-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll index 6fd111577440..7a9d7d7885ff 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -30,7 +30,7 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01] ; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02] ; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03] -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1) %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 @@ -79,7 +79,7 @@ define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01] ; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02] ; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03] -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask) %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 @@ -129,7 +129,7 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01] ; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02] ; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03] -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1) %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 @@ -178,7 +178,7 @@ define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01] ; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02] ; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03] -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: 
[0x62,0xf3,0x75,0x28,0x38,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask) %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll new file mode 100644 index 000000000000..f4cf22c5ed3a --- /dev/null +++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s + +declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0 +; CHECK-NEXT: kmovb %edi, %k0 +; CHECK-NEXT: kshiftlb $7, %k0, %k1 +; CHECK-NEXT: kshiftrb $7, %k1, %k1 +; CHECK-NEXT: kshiftlb $6, %k0, %k0 +; CHECK-NEXT: kshiftrb $7, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vmovq %rax, %xmm2 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; CHECK-NEXT: vpsllq $63, %xmm2, %xmm2 +; CHECK-NEXT: vpsrad $31, %xmm2, %xmm2 +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; CHECK-NEXT: vandpd %xmm0, %xmm2, %xmm2 +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) + %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res2, %res3 + ret <2 x double> %res4 +} + +declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3) + %res2 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1) + %res3 = fadd <8 x float> %res, %res1 + %res4 = fadd <8 x float> %res2, %res3 + ret <8 x float> %res4 +} + +declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x float>, i32, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x 
float> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x8_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) + %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res2, %res3 + ret <16 x float> %res4 +} + +declare <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double>, <2 x double>, i32, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) + %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) + %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res3, %res2 + ret <8 x double> %res4 +} + +declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>, i32, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x8_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +declare <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64>, <2 x i64>, i32, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## 
kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res2, %res3 + ret <8 x i64> %res4 +} diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll index 5826bb6fad23..375d63264517 100644 --- a/test/CodeGen/X86/avx512dq-intrinsics.ll +++ b/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -325,127 +325,6 @@ define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x dou ret <2 x double> %res2 } - -declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, <2 x double>, i8) - -define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} -; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 -; CHECK-NEXT: retq - %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res2, %res3 - ret <2 x double> %res4 -} - -declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <8 x float>, i8) - -define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} -; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; CHECK-NEXT: retq - %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3) - %res2 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res2, %res3 - ret <8 x float> %res4 -} - -declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x float>, i32, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) { -; CHECK-LABEL: 
test_int_x86_avx512_mask_insertf32x8_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) - %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res2, %res3 - ret <16 x float> %res4 -} - -declare <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double>, <2 x double>, i32, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) - %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 -} - -declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>, i32, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x8_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 -} - -declare <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64>, <2 x i64>, i32, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinserti64x2 
$1, %xmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 -} - declare i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double>, i32, i8) define i8 @test_int_x86_avx512_mask_fpclass_pd_512(<8 x double> %x0, i8 %x1) { diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll index 9bf989df22a3..f8460bf880f9 100644 --- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -1560,3 +1560,62 @@ define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] +; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] +; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] +; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] +; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) + %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res3, %res2 + ret <2 x double> %res4 +} + +declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x double>, i32, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] +; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] +; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01] +; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] +; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb] +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## 
EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4) + %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1) + %res2 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4) + %res3 = fadd <4 x double> %res, %res1 + %res4 = fadd <4 x double> %res2, %res3 + ret <4 x double> %res4 +} + +declare <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64>, <2 x i64>, i32, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] +; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] +; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01] +; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1) + %res2 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4) + %res3 = add <4 x i64> %res, %res1 + %res4 = add <4 x i64> %res3, %res2 + ret <4 x i64> %res4 +} diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll index eb9c6b64bcf6..3430c5715376 100644 --- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -549,66 +549,6 @@ define <8 x float>@test_int_x86_avx512_mask_range_ps_256(<8 x float> %x0, <8 x f ret <8 x float> %res2 } -declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32, <2 x double>, i8) - -define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] -; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] -; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc2,0x01] -; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x19,0xc0,0x01] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) - 
%res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res3, %res2 - ret <2 x double> %res4 -} - -declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x double>, i32, <4 x double>, i8) - -define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] -; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01] -; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xd9,0x01] -; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x18,0xc1,0x01] -; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4) - %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1) - %res2 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 -} - -declare <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64>, <2 x i64>, i32, <4 x i64>, i8) - -define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] -; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01] -; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xd9,0x01] -; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 -} - declare i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float>, i32, i8) define i8 @test_int_x86_avx512_mask_fpclass_ps_128(<4 x float> %x0, i8 %x1) { diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 8d44af7b7a4c..c63d47d780d1 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -4773,3 +4773,63 @@ define <4 x 
float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, < ret <4 x float> %res4 } +declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] +; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc0,0x01] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3) + %res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res2, %res3 + ret <4 x float> %res4 +} + +declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x float>, i32, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4) + %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) + %res2 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4) + %res3 = fadd <8 x float> %res, %res1 + %res4 = fadd <8 x float> %res2, %res3 + ret <8 x float> %res4 +} + +declare <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32>, <4 x i32>, i32, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] +; CHECK-NEXT: 
vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + + %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> zeroinitializer, i8 %x4) + %res3 = add <8 x i32> %res, %res1 + %res4 = add <8 x i32> %res2, %res3 + ret <8 x i32> %res4 +} diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index 94095f549e51..82014283246e 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -3621,26 +3621,6 @@ define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64 ret <4 x i64> %res2 } -declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4 x float>, i8) - -define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] -; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc2,0x01] -; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x19,0xc0,0x01] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3) - %res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1) - %res3 = fadd <4 x float> %res, %res1 - %res4 = fadd <4 x float> %res2, %res3 - ret <4 x float> %res4 -} - declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2 x double>, i8) define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) { @@ -3709,47 +3689,6 @@ define <8 x float>@test_int_x86_avx512_mask_getmant_ps_256(<8 x float> %x0, <8 x ret <8 x float> %res2 } -declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x float>, i32, <8 x float>, i8) - -define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xd9,0x01] -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 ## encoding: 
[0x62,0xf3,0x7d,0x28,0x18,0xc1,0x01] -; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4) - %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) - %res2 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res2, %res3 - ret <8 x float> %res4 -} - -declare <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32>, <4 x i32>, i32, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] -; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xd9,0x01] -; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - - %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> zeroinitializer, i8 %x4) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res2, %res3 - ret <8 x i32> %res4 -} - declare <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32, i8) define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) { diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll new file mode 100644 index 000000000000..ab797e04b400 --- /dev/null +++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll @@ -0,0 +1,72 @@ +; Test ensuring debug intrinsics do not affect generated function prologue. +; +; RUN: llc -O1 -mtriple=x86_64-unknown-unknown -o - %s | FileCheck %s + +@a = local_unnamed_addr global i64 0, align 8 + +define void @noDebug() { +entry: + %0 = load i64, i64* @a, align 8 + %1 = load i64, i64* @a, align 8 + %2 = load i64, i64* @a, align 8 + %3 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %0, i64 %1) + %4 = extractvalue { i64, i1 } %3, 0 + %5 = tail call i64 @fn1(i64 %4, i64 %2) + tail call void (...) @printf() + tail call void (...) 
@printf(i64 1, i64 2, i64 3, i64 4, i32 0, i64 0, i64 %4, i64 %5) + ret void +} + +; CHECK-LABEL: noDebug +; CHECK: addq $24, %rsp +; CHECK: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: retq + + +define void @withDebug() !dbg !18 { +entry: + %0 = load i64, i64* @a, align 8 + %1 = load i64, i64* @a, align 8 + %2 = load i64, i64* @a, align 8 + %3 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %0, i64 %1) + %4 = extractvalue { i64, i1 } %3, 0 + %5 = tail call i64 @fn1(i64 %4, i64 %2) + tail call void @llvm.dbg.value(metadata i64 %4, i64 0, metadata !23, metadata !33), !dbg !34 + tail call void @llvm.dbg.value(metadata i64 %5, i64 0, metadata !22, metadata !33), !dbg !35 + tail call void (...) @printf() + tail call void (...) @printf(i64 1, i64 2, i64 3, i64 4, i32 0, i64 0, i64 %4, i64 %5) + ret void +} + +; CHECK-LABEL: withDebug +; CHECK: #DEBUG_VALUE: test:j <- %RBX +; CHECK-NEXT: addq $24, %rsp +; CHECK: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: retq + +declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) +declare i64 @fn1(i64, i64) + +declare void @printf(...) + +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + + +!llvm.dbg.cu = !{!1} +!llvm.module.flags = !{!15, !16} + +!1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "clang version 4.0.0") +!2 = !DIFile(filename: "test.cpp", directory: "") +!11 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed) +!15 = !{i32 2, !"Dwarf Version", i32 4} +!16 = !{i32 2, !"Debug Info Version", i32 3} +!18 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 5, unit: !1) +!22 = !DILocalVariable(name: "i", scope: !18, file: !2, line: 6, type: !11) +!23 = !DILocalVariable(name: "j", scope: !18, file: !2, line: 7, type: !11) +!33 = !DIExpression() +!34 = !DILocation(line: 7, column: 17, scope: !18) +!35 = !DILocation(line: 6, column: 8, scope: !18) +!36 = !DILocation(line: 9, column: 3, scope: !18) +!37 = !DILocation(line: 10, column: 10, scope: !18) diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll index 8614d1b4c6c3..e86d094ac341 100644 --- a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll +++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll @@ -2,31 +2,56 @@ ; ; RUN: llc -O1 -mtriple=x86_64-unknown-unknown -o - %s | FileCheck %s - -define i64 @noDebug(i64 %a) { +define i64 @fn1NoDebug(i64 %a) { %call = call i64 @fn(i64 %a, i64 0) ret i64 %call } -; CHECK-LABEL: noDebug +; CHECK-LABEL: fn1NoDebug ; CHECK: popq %rcx -; CHECK: ret - +; CHECK-NEXT: ret -define i64 @withDebug(i64 %a) !dbg !4 { +define i64 @fn1WithDebug(i64 %a) !dbg !4 { %call = call i64 @fn(i64 %a, i64 0) tail call void @llvm.dbg.value(metadata i64 %call, i64 0, metadata !5, metadata !6), !dbg !7 ret i64 %call } -; CHECK-LABEL: withDebug +; CHECK-LABEL: fn1WithDebug ; CHECK: popq %rcx -; CHECK: ret +; CHECK-NEXT: ret + +%struct.Buffer = type { i8, [63 x i8] } + +define void @fn2NoDebug(%struct.Buffer* byval align 64 %p1) { + ret void +} + +; CHECK-LABEL: fn2NoDebug +; CHECK: and +; CHECK-NOT: add +; CHECK-NOT: sub +; CHECK: mov +; CHECK-NEXT: pop +; CHECK-NEXT: ret + +define void @fn2WithDebug(%struct.Buffer* byval align 64 %p1) !dbg !4 { + call void @llvm.dbg.declare(metadata %struct.Buffer* %p1, metadata !5, metadata !6), !dbg !7 + ret void +} +; CHECK-LABEL: fn2WithDebug +; CHECK: and +; CHECK-NOT: add +; CHECK-NOT: sub +; CHECK: mov +; CHECK-NEXT: pop +; CHECK-NEXT: ret declare i64 @fn(i64, i64) declare void 
@llvm.dbg.value(metadata, i64, metadata, metadata) +declare void @llvm.dbg.declare(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2,!3} diff --git a/test/CodeGen/X86/i64-to-float.ll b/test/CodeGen/X86/i64-to-float.ll index 8898551a9764..da92bdb55d7c 100644 --- a/test/CodeGen/X86/i64-to-float.ll +++ b/test/CodeGen/X86/i64-to-float.ll @@ -71,34 +71,32 @@ define <2 x double> @mask_uitofp_2i64_2f64(<2 x i64> %a) nounwind { define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind { ; X32-SSE-LABEL: mask_sitofp_4i64_4f32: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; X32-SSE-NEXT: retl ; ; X32-AVX-LABEL: mask_sitofp_4i64_4f32: ; X32-AVX: # BB#0: -; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-AVX-NEXT: vzeroupper ; X32-AVX-NEXT: retl ; ; X64-SSE-LABEL: mask_sitofp_4i64_4f32: ; X64-SSE: # BB#0: -; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1 -; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mask_sitofp_4i64_4f32: ; X64-AVX: # BB#0: -; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq @@ -110,34 +108,32 @@ define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind { define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind { ; X32-SSE-LABEL: mask_uitofp_4i64_4f32: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; X32-SSE-NEXT: retl ; ; X32-AVX-LABEL: mask_uitofp_4i64_4f32: ; X32-AVX: # BB#0: -; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-AVX-NEXT: vzeroupper ; X32-AVX-NEXT: retl ; ; X64-SSE-LABEL: mask_uitofp_4i64_4f32: ; X64-SSE: # BB#0: -; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1 -; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mask_uitofp_4i64_4f32: ; X64-AVX: # BB#0: -; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll index cba9a221f774..4e65b169c7e6 100644 --- a/test/CodeGen/X86/masked_memop.ll +++ b/test/CodeGen/X86/masked_memop.ll @@ -1009,7 +1009,7 @@ define 
void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) { ; ; SKX-LABEL: one_mask_bit_set3: ; SKX: ## BB#0: -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vmovq %xmm0, 16(%rdi) ; SKX-NEXT: retq call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>) @@ -1026,17 +1026,11 @@ define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) { ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512F-LABEL: one_mask_bit_set4: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vmovhpd %xmm0, 24(%rdi) -; AVX512F-NEXT: retq -; -; SKX-LABEL: one_mask_bit_set4: -; SKX: ## BB#0: -; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm0 -; SKX-NEXT: vmovhpd %xmm0, 24(%rdi) -; SKX-NEXT: retq +; AVX512-LABEL: one_mask_bit_set4: +; AVX512: ## BB#0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi) +; AVX512-NEXT: retq call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>) ret void } @@ -1109,19 +1103,12 @@ define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_one_mask_bit_set3: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; SKX-LABEL: load_one_mask_bit_set3: -; SKX: ## BB#0: -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; SKX-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 -; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; SKX-NEXT: retq +; AVX512-LABEL: load_one_mask_bit_set3: +; AVX512: ## BB#0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val) ret <4 x i64> %res } @@ -1136,19 +1123,12 @@ define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %v ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: load_one_mask_bit_set4: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; SKX-LABEL: load_one_mask_bit_set4: -; SKX: ## BB#0: -; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm1 -; SKX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; SKX-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 -; SKX-NEXT: retq +; AVX512-LABEL: load_one_mask_bit_set4: +; AVX512: ## BB#0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>, <4 x double> %val) ret <4 x double> %res } diff --git a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll index 198a96df6b1f..c6ae85dda43a 100644 --- a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll +++ b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll @@ -488,7 +488,7 @@ define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x 
float> %a1) { define <4 x float> @stack_fold_extractf32x4(<8 x float> %a0, <8 x float> %a1) { ;CHECK-LABEL: stack_fold_extractf32x4 - ;CHECK: vextractf32x4 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill + ;CHECK: vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <4 x float> %1 @@ -496,7 +496,7 @@ define <4 x float> @stack_fold_extractf32x4(<8 x float> %a0, <8 x float> %a1) { define <2 x double> @stack_fold_extractf64x2(<4 x double> %a0, <4 x double> %a1) { ;CHECK-LABEL: stack_fold_extractf64x2 - ;CHECK: vextractf64x2 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill + ;CHECK: vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <2 x i32> <i32 2, i32 3> %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <2 x double> %1 @@ -504,7 +504,7 @@ define <2 x double> @stack_fold_extractf64x2(<4 x double> %a0, <4 x double> %a1) define <8 x float> @stack_fold_insertf32x4(<4 x float> %a0, <4 x float> %a1) { ;CHECK-LABEL: stack_fold_insertf32x4 - ;CHECK: vinsertf32x4 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + ;CHECK: vinsertf128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ret <8 x float> %2 @@ -512,7 +512,7 @@ define <8 x float> @stack_fold_insertf32x4(<4 x float> %a0, <4 x float> %a1) { define <4 x double> @stack_fold_insertf64x2(<2 x double> %a0, <2 x double> %a1) { ;CHECK-LABEL: stack_fold_insertf64x2 - ;CHECK: vinsertf64x2 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + ;CHECK: vinsertf128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ret <4 x double> %2 diff --git a/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/test/CodeGen/X86/stack-folding-int-avx512vl.ll index 
6847595e9278..77afc49b2576 100644 --- a/test/CodeGen/X86/stack-folding-int-avx512vl.ll +++ b/test/CodeGen/X86/stack-folding-int-avx512vl.ll @@ -445,7 +445,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16 define <4 x i32> @stack_fold_extracti32x4(<8 x i32> %a0, <8 x i32> %a1) { ;CHECK-LABEL: stack_fold_extracti32x4 - ;CHECK: vextracti32x4 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill + ;CHECK: vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill ; add forces execution domain %1 = add <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -455,7 +455,7 @@ define <4 x i32> @stack_fold_extracti32x4(<8 x i32> %a0, <8 x i32> %a1) { define <2 x i64> @stack_fold_extracti64x2(<4 x i64> %a0, <4 x i64> %a1) { ;CHECK-LABEL: stack_fold_extracti64x2 - ;CHECK: vextracti64x2 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill + ;CHECK: vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill ; add forces execution domain %1 = add <4 x i64> %a0, <i64 1, i64 1, i64 1, i64 1> %2 = shufflevector <4 x i64> %1, <4 x i64> %a1, <2 x i32> <i32 2, i32 3> @@ -465,7 +465,7 @@ define <2 x i64> @stack_fold_extracti64x2(<4 x i64> %a0, <4 x i64> %a1) { define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) { ;CHECK-LABEL: stack_fold_inserti32x4 - ;CHECK: vinserti32x4 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + ;CHECK: vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; add forces execution domain @@ -475,7 +475,7 @@ define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) { define <4 x i64> @stack_fold_inserti64x2(<2 x i64> %a0, <2 x i64> %a1) { ;CHECK-LABEL: stack_fold_inserti64x2 - ;CHECK: vinserti64x2 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + ;CHECK: vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; add forces execution domain diff --git a/test/CodeGen/X86/subvector-broadcast.ll b/test/CodeGen/X86/subvector-broadcast.ll index b4f7cb9e106d..7aa3f393bbed 100644 --- a/test/CodeGen/X86/subvector-broadcast.ll +++ b/test/CodeGen/X86/subvector-broadcast.ll @@ -832,7 +832,7 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX512F-NEXT: vmovaps %xmm0, 
(%eax) -; X32-AVX512F-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse: @@ -841,7 +841,7 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512BW-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vmovaps %xmm0, (%eax) -; X32-AVX512BW-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse: @@ -850,7 +850,7 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512DQ-NEXT: vmovapd (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vmovapd %xmm0, (%eax) -; X32-AVX512DQ-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse: @@ -864,21 +864,21 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub ; X64-AVX512F: ## BB#0: ; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512F-NEXT: vmovaps %xmm0, (%rsi) -; X64-AVX512F-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse: ; X64-AVX512BW: ## BB#0: ; X64-AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vmovaps %xmm0, (%rsi) -; X64-AVX512BW-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse: ; X64-AVX512DQ: ## BB#0: ; X64-AVX512DQ-NEXT: vmovapd (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vmovapd %xmm0, (%rsi) -; X64-AVX512DQ-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = load <2 x double>, <2 x double>* %p0 store <2 x double> %1, <2 x double>* %p1 @@ -896,32 +896,14 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: test_broadcast_2i64_4i64_reuse: -; X32-AVX512F: ## BB#0: -; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0 -; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: test_broadcast_2i64_4i64_reuse: -; X32-AVX512BW: ## BB#0: -; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0 -; X32-AVX512BW-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: test_broadcast_2i64_4i64_reuse: -; X32-AVX512DQ: ## BB#0: -; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0 -; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512DQ-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: test_broadcast_2i64_4i64_reuse: +; X32-AVX512: ## BB#0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
X32-AVX512-NEXT: vmovdqa (%ecx), %xmm0 +; X32-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X32-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse: ; X64-AVX: ## BB#0: @@ -930,26 +912,12 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: test_broadcast_2i64_4i64_reuse: -; X64-AVX512F: ## BB#0: -; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: test_broadcast_2i64_4i64_reuse: -; X64-AVX512BW: ## BB#0: -; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; X64-AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: test_broadcast_2i64_4i64_reuse: -; X64-AVX512DQ: ## BB#0: -; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512DQ-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq %1 = load <2 x i64>, <2 x i64>* %p0 store <2 x i64> %1, <2 x i64>* %p1 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> @@ -957,37 +925,21 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) } define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) { -; X32-AVX-LABEL: test_broadcast_4f32_8f32_reuse: -; X32-AVX: ## BB#0: -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 -; X32-AVX-NEXT: vmovaps %xmm0, (%eax) -; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX-NEXT: retl -; -; X32-AVX512-LABEL: test_broadcast_4f32_8f32_reuse: -; X32-AVX512: ## BB#0: -; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512-NEXT: vmovaps (%ecx), %xmm0 -; X32-AVX512-NEXT: vmovaps %xmm0, (%eax) -; X32-AVX512-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512-NEXT: retl -; -; X64-AVX-LABEL: test_broadcast_4f32_8f32_reuse: -; X64-AVX: ## BB#0: -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 -; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) -; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX-NEXT: retq -; -; X64-AVX512-LABEL: test_broadcast_4f32_8f32_reuse: -; X64-AVX512: ## BB#0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0 -; X64-AVX512-NEXT: vmovaps %xmm0, (%rsi) -; X64-AVX512-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512-NEXT: retq +; X32-LABEL: test_broadcast_4f32_8f32_reuse: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: vmovaps (%ecx), %xmm0 +; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_broadcast_4f32_8f32_reuse: +; X64: ## BB#0: +; X64-NEXT: vmovaps (%rdi), %xmm0 +; X64-NEXT: vmovaps %xmm0, (%rsi) +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-NEXT: retq %1 = load <4 x float>, <4 x float>* %p0 store <4 x float> %1, <4 x float>* %p1 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 
3, i32 0, i32 1, i32 2, i32 3> @@ -1010,7 +962,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse: @@ -1024,7 +976,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) ; X64-AVX512: ## BB#0: ; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x i32> %1, <4 x i32>* %p1 @@ -1048,7 +1000,7 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse: @@ -1057,7 +1009,7 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax) -; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse: @@ -1066,7 +1018,7 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse: @@ -1080,21 +1032,21 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p ; X64-AVX512F: ## BB#0: ; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse: ; X64-AVX512BW: ## BB#0: ; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi) -; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse: ; X64-AVX512DQ: ## BB#0: ; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = load <8 x i16>, <8 x i16> *%p0 store <8 x i16> %1, <8 x i16>* %p1 @@ -1118,7 +1070,7 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, 
%ymm0, %ymm0 ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse: @@ -1127,7 +1079,7 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax) -; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse: @@ -1136,7 +1088,7 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse: @@ -1150,21 +1102,21 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) ; X64-AVX512F: ## BB#0: ; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse: ; X64-AVX512BW: ## BB#0: ; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi) -; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse: ; X64-AVX512DQ: ## BB#0: ; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = load <16 x i8>, <16 x i8> *%p0 store <16 x i8> %1, <16 x i8>* %p1 @@ -1194,7 +1146,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax) -; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain: @@ -1204,7 +1156,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax) -; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain: @@ -1214,7 +1166,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax) -; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain: @@ -1230,7 +1182,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi) -; 
X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain: @@ -1238,7 +1190,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) -; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain: @@ -1246,7 +1198,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi) -; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x float> zeroinitializer, <4 x float>* %p1 @@ -1349,6 +1301,44 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* @gb4 = global <8 x i64> zeroinitializer, align 8 define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) { +; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64: +; X32-AVX1: ## BB#0: ## %entry +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,2,0,3,0,4,0] +; X32-AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; X32-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; X32-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; X32-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; X32-AVX1-NEXT: vmovups %ymm0, _ga4 +; X32-AVX1-NEXT: vmovups %ymm2, _gb4+32 +; X32-AVX1-NEXT: vmovups %ymm1, _gb4 +; X32-AVX1-NEXT: vzeroupper +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64: +; X32-AVX2: ## BB#0: ## %entry +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0] +; X32-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X32-AVX2-NEXT: vmovdqu %ymm0, _ga4 +; X32-AVX2-NEXT: vmovdqu %ymm2, _gb4+32 +; X32-AVX2-NEXT: vmovdqu %ymm1, _gb4 +; X32-AVX2-NEXT: vzeroupper +; X32-AVX2-NEXT: retl +; ; X32-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X32-AVX512: ## BB#0: ## %entry ; X32-AVX512-NEXT: vpaddq LCPI26_0, %ymm0, %ymm0 @@ -1359,6 +1349,45 @@ define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) { ; X32-AVX512-NEXT: vmovdqu64 %zmm1, _gb4 ; X32-AVX512-NEXT: retl ; +; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64: +; X64-AVX1: ## BB#0: ## %entry +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [3,4] +; X64-AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2] +; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 +; X64-AVX1-NEXT: vinsertf128 
$1, %xmm3, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [1,2,3,4] +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4 +; X64-AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; X64-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; X64-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vmovups %ymm0, {{.*}}(%rip) +; X64-AVX1-NEXT: vmovups %ymm2, _gb4+{{.*}}(%rip) +; X64-AVX1-NEXT: vmovups %ymm1, {{.*}}(%rip) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64: +; X64-AVX2: ## BB#0: ## %entry +; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4] +; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vmovdqu %ymm0, {{.*}}(%rip) +; X64-AVX2-NEXT: vmovdqu %ymm2, _gb4+{{.*}}(%rip) +; X64-AVX2-NEXT: vmovdqu %ymm1, {{.*}}(%rip) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; ; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X64-AVX512: ## BB#0: ## %entry ; X64-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4] @@ -1383,6 +1412,20 @@ entry: @gb2 = global <8 x double> zeroinitializer, align 8 define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) { +; X32-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64: +; X32-AVX: ## BB#0: ## %entry +; X32-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; X32-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; X32-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2 +; X32-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; X32-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1 +; X32-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2 +; X32-AVX-NEXT: vmovupd %ymm0, _ga2 +; X32-AVX-NEXT: vmovupd %ymm2, _gb2+32 +; X32-AVX-NEXT: vmovupd %ymm1, _gb2 +; X32-AVX-NEXT: vzeroupper +; X32-AVX-NEXT: retl +; ; X32-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64: ; X32-AVX512: ## BB#0: ## %entry ; X32-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] @@ -1394,6 +1437,20 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) ; X32-AVX512-NEXT: vmovupd %zmm1, _gb2 ; X32-AVX512-NEXT: retl ; +; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64: +; X64-AVX: ## BB#0: ## %entry +; X64-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2 +; X64-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; X64-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1 +; X64-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2 +; X64-AVX-NEXT: vmovupd %ymm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovupd %ymm2, _gb2+{{.*}}(%rip) +; X64-AVX-NEXT: vmovupd %ymm1, {{.*}}(%rip) +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq +; ; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64: ; X64-AVX512: ## BB#0: ## %entry ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll index 2ced6de6aebe..2ad20a89cf26 100644 --- a/test/CodeGen/X86/vec_fp_to_int.ll +++ b/test/CodeGen/X86/vec_fp_to_int.ll @@ -204,7 +204,7 @@ define <4 x i64> 
@fptosi_4f64_to_4i64(<4 x double> %a) { ; ; AVX512VL-LABEL: fptosi_4f64_to_4i64: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm2 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] @@ -217,7 +217,7 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) { ; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptosi_4f64_to_4i64: @@ -719,7 +719,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; ; AVX512VL-LABEL: fptoui_4f64_to_4i64: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm2 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] @@ -732,7 +732,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_4f64_to_4i64: @@ -1097,7 +1097,7 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) { ; AVX512VL-NEXT: vcvttss2si %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptosi_4f32_to_4i64: @@ -1205,7 +1205,7 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) { ; AVX512VL-NEXT: vmovq %rcx, %xmm1 ; AVX512VL-NEXT: vmovq %rax, %xmm2 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptosi_8f32_to_4i64: @@ -1822,7 +1822,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_4f32_to_4i64: @@ -2000,7 +2000,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX512VL-NEXT: vmovq %rcx, %xmm1 ; AVX512VL-NEXT: vmovq %rax, %xmm2 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_8f32_to_4i64: @@ -2409,125 +2409,29 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { ; SSE-NEXT: popq %r14 ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f128_to_4i32: -; VEX: # BB#0: -; VEX-NEXT: pushq %r14 -; VEX-NEXT: pushq %rbx -; VEX-NEXT: subq $24, %rsp -; VEX-NEXT: movq %rsi, %r14 -; VEX-NEXT: movq %rdi, %rbx -; VEX-NEXT: movq %rdx, %rdi -; VEX-NEXT: movq %rcx, %rsi -; VEX-NEXT: callq __fixtfdi -; VEX-NEXT: vmovq %rax, %xmm0 -; VEX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; VEX-NEXT: movq %rbx, %rdi -; VEX-NEXT: movq %r14, %rsi -; VEX-NEXT: 
callq __fixtfdi -; VEX-NEXT: vmovq %rax, %xmm0 -; VEX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; VEX-NEXT: # xmm0 = xmm0[0],mem[0] -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; VEX-NEXT: addq $24, %rsp -; VEX-NEXT: popq %rbx -; VEX-NEXT: popq %r14 -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptosi_2f128_to_4i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: pushq %r14 -; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: subq $24, %rsp -; AVX512F-NEXT: movq %rsi, %r14 -; AVX512F-NEXT: movq %rdi, %rbx -; AVX512F-NEXT: movq %rdx, %rdi -; AVX512F-NEXT: movq %rcx, %rsi -; AVX512F-NEXT: callq __fixtfdi -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512F-NEXT: movq %rbx, %rdi -; AVX512F-NEXT: movq %r14, %rsi -; AVX512F-NEXT: callq __fixtfdi -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX512F-NEXT: addq $24, %rsp -; AVX512F-NEXT: popq %rbx -; AVX512F-NEXT: popq %r14 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f128_to_4i32: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: pushq %r14 -; AVX512VL-NEXT: pushq %rbx -; AVX512VL-NEXT: subq $24, %rsp -; AVX512VL-NEXT: movq %rsi, %r14 -; AVX512VL-NEXT: movq %rdi, %rbx -; AVX512VL-NEXT: movq %rdx, %rdi -; AVX512VL-NEXT: movq %rcx, %rsi -; AVX512VL-NEXT: callq __fixtfdi -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: movq %rbx, %rdi -; AVX512VL-NEXT: movq %r14, %rsi -; AVX512VL-NEXT: callq __fixtfdi -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX512VL-NEXT: addq $24, %rsp -; AVX512VL-NEXT: popq %rbx -; AVX512VL-NEXT: popq %r14 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f128_to_4i32: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: pushq %r14 -; AVX512DQ-NEXT: pushq %rbx -; AVX512DQ-NEXT: subq $24, %rsp -; AVX512DQ-NEXT: movq %rsi, %r14 -; AVX512DQ-NEXT: movq %rdi, %rbx -; AVX512DQ-NEXT: movq %rdx, %rdi -; AVX512DQ-NEXT: movq %rcx, %rsi -; AVX512DQ-NEXT: callq __fixtfdi -; AVX512DQ-NEXT: vmovq %rax, %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512DQ-NEXT: movq %rbx, %rdi -; AVX512DQ-NEXT: movq %r14, %rsi -; AVX512DQ-NEXT: callq __fixtfdi -; AVX512DQ-NEXT: vmovq %rax, %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX512DQ-NEXT: addq $24, %rsp -; AVX512DQ-NEXT: popq %rbx -; AVX512DQ-NEXT: popq %r14 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f128_to_4i32: -; AVX512VLDQ: # BB#0: -; AVX512VLDQ-NEXT: pushq %r14 -; AVX512VLDQ-NEXT: pushq %rbx -; AVX512VLDQ-NEXT: subq $24, %rsp -; AVX512VLDQ-NEXT: movq %rsi, %r14 -; AVX512VLDQ-NEXT: movq %rdi, %rbx -; AVX512VLDQ-NEXT: movq %rdx, %rdi -; AVX512VLDQ-NEXT: movq %rcx, %rsi -; AVX512VLDQ-NEXT: callq __fixtfdi -; AVX512VLDQ-NEXT: vmovq %rax, %xmm0 -; AVX512VLDQ-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512VLDQ-NEXT: movq %rbx, %rdi -; AVX512VLDQ-NEXT: movq %r14, %rsi -; AVX512VLDQ-NEXT: callq __fixtfdi -; AVX512VLDQ-NEXT: vmovq %rax, %xmm0 -; AVX512VLDQ-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VLDQ-NEXT: # xmm0 = xmm0[0],mem[0] -; 
AVX512VLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX512VLDQ-NEXT: addq $24, %rsp -; AVX512VLDQ-NEXT: popq %rbx -; AVX512VLDQ-NEXT: popq %r14 -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptosi_2f128_to_4i32: +; AVX: # BB#0: +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $24, %rsp +; AVX-NEXT: movq %rsi, %r14 +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: movq %rdx, %rdi +; AVX-NEXT: movq %rcx, %rsi +; AVX-NEXT: callq __fixtfdi +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: movq %rbx, %rdi +; AVX-NEXT: movq %r14, %rsi +; AVX-NEXT: callq __fixtfdi +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r14 +; AVX-NEXT: retq %cvt = fptosi <2 x fp128> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ret <4 x i32> %ext diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index 58d7f7bf3d83..6a81cdc490fe 100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -288,7 +288,7 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; ; AVX512VL-LABEL: sitofp_4i64_to_4f64: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax @@ -299,7 +299,7 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_4i64_to_4f64: @@ -821,7 +821,7 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { ; ; AVX512VL-LABEL: uitofp_4i64_to_4f64: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax @@ -832,7 +832,7 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_4i64_to_4f64: @@ -1430,7 +1430,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -2344,7 +2344,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: 
vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -2775,7 +2775,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; AVX512VL-LABEL: sitofp_load_4i64_to_4f64: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax @@ -2786,7 +2786,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64: @@ -3190,7 +3190,7 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; AVX512VL-LABEL: uitofp_load_4i64_to_4f64: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax @@ -3201,7 +3201,7 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64: @@ -3426,7 +3426,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -3667,7 +3667,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32: @@ -4013,7 +4013,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -4593,7 +4593,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; 
; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32: diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll index 31eb2202a05e..5bf6fbeb6235 100644 --- a/test/CodeGen/X86/vector-half-conversions.ll +++ b/test/CodeGen/X86/vector-half-conversions.ll @@ -461,7 +461,7 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind { ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = bitcast <8 x i16> %a0 to <8 x half> %2 = fpext <8 x half> %1 to <8 x float> @@ -757,7 +757,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind { ; ; AVX512VL-LABEL: cvt_16i16_to_16f32: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm10 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: movq %rax, %rcx ; AVX512VL-NEXT: shrq $48, %rcx @@ -840,14 +840,14 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind { ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VL-NEXT: retq %1 = bitcast <16 x i16> %a0 to <16 x half> @@ -1227,7 +1227,7 @@ define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind { ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = bitcast <8 x i16> %1 to <8 x half> @@ -1491,14 +1491,14 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind { ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm2, %ymm1 +; AVX512VL-NEXT: 
vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VL-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %a0 @@ -1738,7 +1738,7 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind { ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = bitcast <4 x i16> %a0 to <4 x half> %2 = fpext <4 x half> %1 to <4 x double> @@ -1929,7 +1929,7 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind { ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %2 = bitcast <4 x i16> %1 to <4 x half> @@ -2145,14 +2145,14 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { ; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 ; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512VL-NEXT: retq %1 = bitcast <8 x i16> %a0 to <8 x half> @@ -2350,7 +2350,7 @@ define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind { ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = load <4 x i16>, <4 x i16>* %a0 %2 = bitcast <4 x i16> %1 to <4 x half> @@ -2474,7 +2474,7 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind { ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> @@ -2643,14 +2643,14 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind { ; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 ; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; 
AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512VL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 @@ -3182,7 +3182,7 @@ define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind { ; AVX512VL-NEXT: orl %edx, %eax ; AVX512VL-NEXT: shlq $32, %rax ; AVX512VL-NEXT: orq %rcx, %rax -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovd %xmm1, %ecx @@ -3427,7 +3427,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; AVX512VL-NEXT: vmovd %xmm2, %eax -; AVX512VL-NEXT: vextractf32x4 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 @@ -3458,7 +3458,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 @@ -3479,7 +3479,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { ; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = fptrunc <16 x float> %a0 to <16 x half> %2 = bitcast <16 x half> %1 to <16 x i16> @@ -3958,7 +3958,7 @@ define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind { ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovd %xmm1, %r10d -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovd %xmm2, %r11d @@ -4191,9 +4191,9 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin ; ; AVX512VL-LABEL: store_cvt_16f32_to_16i16: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vextractf32x4 $1, %ymm2, %xmm3 +; AVX512VL-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm4 ; AVX512VL-NEXT: vmovd %xmm4, %eax ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm4 @@ -4422,7 +4422,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { ; AVX512VL-NEXT: movzwl %ax, %r14d ; AVX512VL-NEXT: orl %ebx, %r14d ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -4572,7 +4572,7 @@ define <8 x i16> 
@cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; AVX512VL-NEXT: movzwl %ax, %r14d ; AVX512VL-NEXT: orl %ebx, %r14d ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -4726,7 +4726,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { ; AVX512VL-NEXT: movzwl %ax, %r14d ; AVX512VL-NEXT: orl %ebx, %r14d ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -4969,7 +4969,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { ; AVX512VL-NEXT: movzwl %ax, %r15d ; AVX512VL-NEXT: orl %ebx, %r15d ; AVX512VL-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -4994,7 +4994,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { ; AVX512VL-NEXT: movzwl %ax, %r15d ; AVX512VL-NEXT: orl %ebx, %r15d ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -5188,7 +5188,7 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind { ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movl %eax, %r14d ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -5357,7 +5357,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun ; AVX512VL-NEXT: movzwl %ax, %ebx ; AVX512VL-NEXT: orl %ebp, %ebx ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -5528,7 +5528,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw ; AVX512VL-NEXT: movzwl %ax, %ebx ; AVX512VL-NEXT: orl %ebp, %ebx ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -5775,7 +5775,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind { ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 
$1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -5787,7 +5787,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind { ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movl %eax, %r12d ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 diff --git a/test/CodeGen/X86/vector-lzcnt-256.ll b/test/CodeGen/X86/vector-lzcnt-256.ll index 3ad13e03dbde..c68395493023 100644 --- a/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/test/CodeGen/X86/vector-lzcnt-256.ll @@ -710,35 +710,20 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VLCD-LABEL: testv32i8: -; AVX512VLCD: ## BB#0: -; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1 -; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VLCD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX512VLCD-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VLCD-NEXT: retq -; -; AVX512CD-LABEL: testv32i8: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 -; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; 
AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512CD-NEXT: retq +; AVX512-LABEL: testv32i8: +; AVX512: ## BB#0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512-NEXT: vplzcntd %zmm1, %zmm1 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq ; ; X32-AVX-LABEL: testv32i8: ; X32-AVX: # BB#0: @@ -799,35 +784,20 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VLCD-LABEL: testv32i8u: -; AVX512VLCD: ## BB#0: -; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1 -; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VLCD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX512VLCD-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VLCD-NEXT: retq -; -; AVX512CD-LABEL: testv32i8u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 -; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512CD-NEXT: retq +; AVX512-LABEL: testv32i8u: +; AVX512: ## BB#0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512-NEXT: vplzcntd %zmm1, %zmm1 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq ; ; X32-AVX-LABEL: testv32i8u: ; X32-AVX: # BB#0: diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index ba47740cbaf0..3c7fd8b51a02 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1789,25 +1789,15 @@ define <16 x i16> @shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: 
shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 11, i32 8, i32 9, i32 8, i32 9, i32 10, i32 11, i32 10, i32 11> ret <16 x i16> %shuffle } @@ -1822,23 +1812,14 @@ define <16 x i16> @shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 9, i32 14, i32 15, i32 12, i32 13, i32 10, i32 11, i32 8, i32 9> ret <16 x i16> %shuffle } @@ -1885,23 +1866,14 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: 
shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> ret <16 x i16> %shuffle } @@ -1919,29 +1891,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -1957,25 +1917,15 @@ define <16 x i16> @shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: 
shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 11, i32 undef, i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11> ret <16 x i16> %shuffle } @@ -1991,25 +1941,15 @@ define <16 x i16> @shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2OR512VL-NEXT: vpunpckhwd 
{{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 15, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15> ret <16 x i16> %shuffle } @@ -2026,27 +1966,16 @@ define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 13, i32 11, i32 9, i32 10, i32 8, i32 14, i32 15, i32 12, i32 13> ret <16 x i16> %shuffle } @@ -2062,25 +1991,15 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 8, i32 12, i32 12, i32 12, i32 12, i32 8, i32 8, i32 8, i32 8> ret <16 x i16> %shuffle } @@ -2095,23 +2014,14 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13> ret <16 x i16> %shuffle } @@ -2128,27 +2038,16 @@ define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] -; 
AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 10, i32 14, i32 15, i32 12, i32 13> ret <16 x i16> %shuffle } @@ -2164,25 +2063,15 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 15, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 15> ret <16 x i16> %shuffle } @@ -2210,12 +2099,12 @@ define <16 x i16> @shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_0 ; ; AVX512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = 
shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 8, i32 15, i32 13, i32 14, i32 12, i32 11, i32 9, i32 10, i32 8> ret <16 x i16> %shuffle @@ -2232,25 +2121,15 @@ define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 8, i32 9, i32 8, i32 13, i32 12, i32 13, i32 12, i32 9, i32 8> ret <16 x i16> %shuffle } @@ -2266,25 +2145,15 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 8, i32 13, i32 12, i32 9, i32 8, i32 13, i32 12, i32 9, i32 8> ret <16 x i16> %shuffle } @@ -2300,25 +2169,15 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 12, i32 13, i32 12, i32 9, i32 8, i32 9, i32 8, i32 13, i32 12> ret <16 x i16> %shuffle } @@ -2334,25 +2193,15 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: 
retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8> ret <16 x i16> %shuffle } @@ -2368,25 +2217,15 @@ define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12> ret <16 x i16> %shuffle } @@ -2414,12 +2253,12 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 11, i32 10, i32 14, i32 12, i32 8, i32 13, i32 9, i32 15, i32 11> ret <16 x i16> %shuffle @@ -2448,12 +2287,12 @@ 
define <16 x i16> @shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 11, i32 10, i32 8, i32 14, i32 12, i32 13, i32 9, i32 15, i32 11> ret <16 x i16> %shuffle @@ -2482,12 +2321,12 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 13, i32 10, i32 14, i32 12, i32 8, i32 9, i32 11, i32 15, i32 13> ret <16 x i16> %shuffle @@ -2516,12 +2355,12 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 11, i32 14, i32 14, i32 15, i32 13, i32 9, i32 14, i32 12, i32 11> ret <16 x i16> %shuffle @@ -2538,25 +2377,15 @@ define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -2572,25 +2401,15 @@ define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -2606,25 +2425,15 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq 
-; -; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -2640,25 +2449,15 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 8, i32 8, i32 8> ret <16 x i16> %shuffle } @@ -2675,27 +2474,16 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm0[0,2,2,0,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 15, i32 8, i32 12, i32 12, i32 8, i32 12, i32 13, i32 14, i32 15> ret <16 x i16> %shuffle } @@ -2711,25 +2499,15 @@ define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 undef, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -2745,25 +2523,15 @@ define <16 x i16> 
@shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 undef, i32 8, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -2779,25 +2547,15 @@ define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq 
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 undef, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -2848,13 +2606,13 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11> ret <16 x i16> %shuffle @@ -2926,7 +2684,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] @@ -2934,7 +2692,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_1 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,1,2] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 11, i32 8, i32 9, i32 10, i32 15, i32 12, i32 13, i32 14, i32 11> ret <16 x i16> %shuffle @@ -2961,13 +2719,13 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_1 ; ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3] ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,3,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 15, i32 12, i32 13, i32 14, i32 11, i32 8, i32 9, i32 10, i32 15> ret <16 x i16> %shuffle @@ -2996,12 +2754,12 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1 ; ; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: 
vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 13, i32 11, i32 15, i32 9, i32 8, i32 10, i32 15, i32 11, i32 13> ret <16 x i16> %shuffle @@ -3693,23 +3451,14 @@ define <16 x i16> @shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12> ret <16 x i16> %shuffle } @@ -3809,23 +3558,14 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; AVX512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; 
AVX2OR512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10> ret <16 x i16> %shuffle } @@ -4000,17 +3740,11 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16> ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19: -; AVX2: # BB#0: -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19> ret <16 x i16> %shuffle } @@ -4023,19 +3757,12 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: -; AVX2: # BB#0: -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ret <16 x i16> %shuffle } @@ -4049,17 +3776,11 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a, ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpbroadcastw 
%xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> ret <16 x i16> %shuffle } @@ -4091,19 +3812,12 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ret <16 x i16> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll index d4ec55a85d8d..301e8079a5dc 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -741,17 +741,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2OR512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> ret <32 x i8> %shuffle } @@ -1167,19 +1161,12 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_ ; 
AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2OR512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ret <32 x i8> %shuffle } @@ -1706,17 +1693,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: -; AVX2: # BB#0: -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40> ret <32 x i8> %shuffle } @@ -1787,19 +1768,12 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: -; AVX2: # BB#0: -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-NEXT: vpunpcklbw {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47> ret <32 x i8> %shuffle } @@ -2188,7 +2162,7 @@ define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_ ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10> ret <32 x i8> %shuffle @@ -2203,17 +2177,11 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = 
shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> ret <32 x i8> %shuffle } @@ -2277,7 +2245,7 @@ define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_ ; ; AVX512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index 3ecfc29d0f01..7f978138719e 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -320,39 +320,19 @@ define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) { } define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) { -; AVX1-LABEL: shuffle_v4f64_0145: -; AVX1: # BB#0: -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v4f64_0145: -; AVX2: # BB#0: -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4f64_0145: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v4f64_0145: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) { -; AVX1-LABEL: shuffle_v4f64_4501: -; AVX1: # BB#0: -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v4f64_4501: -; AVX2: # BB#0: -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4f64_4501: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v4f64_4501: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> ret <4 x double> %shuffle } @@ -367,23 +347,11 @@ define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) { } define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) { -; AVX1-LABEL: shuffle_v4f64_1054: -; AVX1: # BB#0: -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v4f64_1054: -; AVX2: # BB#0: -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4f64_1054: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v4f64_1054: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; ALL-NEXT: retq %shuffle = shufflevector 
<4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4> ret <4 x double> %shuffle } @@ -735,7 +703,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0142: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX512VL-NEXT: retq @@ -808,7 +776,7 @@ define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0145: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x i64> %shuffle @@ -852,7 +820,7 @@ define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_4501: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> ret <4 x i64> %shuffle @@ -948,7 +916,7 @@ define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_1054: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4> @@ -1424,7 +1392,7 @@ define <4 x i64> @concat_v4i64_0145_bc(<4 x i64> %a0, <4 x i64> %a1) { ; ; AVX512VL-LABEL: concat_v4i64_0145_bc: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 0, i32 1> %a1lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 4, i32 5> diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll index d6e91ca25d75..cba15827d32c 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -753,17 +753,11 @@ define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_3210ba98: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_3210ba98: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v8f32_3210ba98: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8> ret <8 x float> %shuffle } @@ -829,17 +823,11 @@ define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_ba983210: -; 
AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_ba983210: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinsertf64x2 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v8f32_ba983210: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0> ret <8 x float> %shuffle } @@ -863,17 +851,11 @@ define <8 x float> @shuffle_v8f32_a2u3e6f7(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_uuuu1111: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_uuuu1111: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v8f32_uuuu1111: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1> ret <8 x float> %shuffle } @@ -885,17 +867,11 @@ define <8 x float> @shuffle_v8f32_44444444(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8f32_44444444: -; AVX2: # BB#0: -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vbroadcastss %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_44444444: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vbroadcastss %xmm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8f32_44444444: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> ret <8 x float> %shuffle } @@ -910,33 +886,21 @@ define <8 x float> @shuffle_v8f32_1188uuuu(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_uuuu3210: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_uuuu3210: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v8f32_uuuu3210: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 2, i32 1, i32 0> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_uuuu1188(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_uuuu1188: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: 
vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0] -; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_uuuu1188: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v8f32_uuuu1188: +; ALL: # BB#0: +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0] +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 8, i32 8> ret <8 x float> %shuffle } @@ -951,17 +915,11 @@ define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_5555uuuu: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_5555uuuu: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v8f32_5555uuuu: +; ALL: # BB#0: +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x float> %shuffle } @@ -1041,17 +999,11 @@ define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00040000: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00040000: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00040000: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0> ret <8 x i32> %shuffle } @@ -1064,17 +1016,11 @@ define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00500000: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00500000: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00500000: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i32> %shuffle } @@ -1087,17 +1033,11 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: 
shuffle_v8i32_06000000: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_06000000: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_06000000: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i32> %shuffle } @@ -1142,17 +1082,11 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00112233: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00112233: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00112233: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3> ret <8 x i32> %shuffle } @@ -1556,17 +1490,11 @@ define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00015444: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00015444: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00015444: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4> ret <8 x i32> %shuffle } @@ -1577,17 +1505,11 @@ define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00204644: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00204644: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00204644: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4> ret <8 x i32> %shuffle } @@ -1598,17 +1520,11 @@ define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4] ; AVX1-NEXT: retq ; 
-; AVX2-LABEL: shuffle_v8i32_03004474: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_03004474: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_03004474: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4> ret <8 x i32> %shuffle } @@ -1619,17 +1535,11 @@ define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_10004444: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_10004444: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_10004444: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> ret <8 x i32> %shuffle } @@ -1640,17 +1550,11 @@ define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_22006446: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_22006446: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_22006446: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6> ret <8 x i32> %shuffle } @@ -1661,17 +1565,11 @@ define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_33307474: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_33307474: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_33307474: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4> ret <8 x i32> %shuffle } @@ -1682,17 +1580,11 @@ define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = 
ymm0[3,2,1,0,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_32104567: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_32104567: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_32104567: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> ret <8 x i32> %shuffle } @@ -1703,17 +1595,11 @@ define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00236744: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00236744: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00236744: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4> ret <8 x i32> %shuffle } @@ -1724,17 +1610,11 @@ define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00226644: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00226644: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00226644: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4> ret <8 x i32> %shuffle } @@ -1745,17 +1625,11 @@ define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_10324567: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_10324567: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_10324567: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7> ret <8 x i32> %shuffle } @@ -1766,17 +1640,11 @@ define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: 
vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_11334567: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_11334567: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_11334567: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7> ret <8 x i32> %shuffle } @@ -1787,17 +1655,11 @@ define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_01235467: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_01235467: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_01235467: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> ret <8 x i32> %shuffle } @@ -1808,17 +1670,11 @@ define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_01235466: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_01235466: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_01235466: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6> ret <8 x i32> %shuffle } @@ -1829,17 +1685,11 @@ define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_002u6u44: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4> -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_002u6u44: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4> -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_002u6u44: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4> ret <8 x i32> %shuffle } @@ -1850,17 +1700,11 @@ define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> 
%a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00uu66uu: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u> -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00uu66uu: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u> -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00uu66uu: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef> ret <8 x i32> %shuffle } @@ -1871,17 +1715,11 @@ define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_103245uu: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u> -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_103245uu: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u> -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_103245uu: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef> ret <8 x i32> %shuffle } @@ -1892,17 +1730,11 @@ define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_1133uu67: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7> -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_1133uu67: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7> -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_1133uu67: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7> ret <8 x i32> %shuffle } @@ -1913,17 +1745,11 @@ define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_0uu354uu: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u> -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_0uu354uu: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u> -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_0uu354uu: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef> ret <8 x i32> %shuffle } 
@@ -1934,17 +1760,11 @@ define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_uuu3uu66: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6> -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_uuu3uu66: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6> -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_uuu3uu66: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6> ret <8 x i32> %shuffle } @@ -2038,17 +1858,11 @@ define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_3210ba98: -; AVX2: # BB#0: -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_3210ba98: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_3210ba98: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8> ret <8 x i32> %shuffle } @@ -2185,17 +1999,11 @@ define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_uuuu1111: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_uuuu1111: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_uuuu1111: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1> ret <8 x i32> %shuffle } @@ -2233,7 +2041,7 @@ define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v8i32_44444444: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> @@ -2247,17 +2055,11 @@ define <8 x i32> @shuffle_v8i32_5555uuuu(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_5555uuuu: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-NEXT: retq -; -; 
AVX512VL-LABEL: shuffle_v8i32_5555uuuu: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_5555uuuu: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i32> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll index 7f7c27af47b3..b951bf1c97ed 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW target triple = "x86_64-unknown-unknown" @@ -35,7 +35,7 @@ define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d define <16 x float> @shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz: ; ALL: # BB#0: -; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; ALL-NEXT: vxorps %zmm1, %zmm1, %zmm1 ; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <16 x i32><i32 0, i32 16, i32 1, i32 16, i32 4, i32 16, i32 5, i32 16, i32 8, i32 16, i32 9, i32 16, i32 12, i32 16, i32 13, i32 16> @@ -82,7 +82,7 @@ define <16 x float> @shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f define <16 x float> @shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f: ; ALL: # BB#0: -; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; ALL-NEXT: vxorps %zmm0, %zmm0, %zmm0 ; ALL-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> zeroinitializer, <16 x float> %b, <16 x i32><i32 0, i32 18, i32 0, i32 19, i32 4, i32 22, i32 4, i32 23, i32 6, i32 26, i32 6, i32 27, i32 8, i32 30, i32 8, i32 31> @@ -262,8 +262,8 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) { ; ALL-LABEL: shuffle_v16f32_extract_256: ; ALL: # BB#0: -; ALL-NEXT: vmovupd (%rsi), %zmm0 -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vmovups (%rsi), %zmm0 +; ALL-NEXT: vextractf32x8 $1, %zmm0, %ymm0 ; ALL-NEXT: retq %ptr_a = bitcast float* %a to <16 x float>* %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4 @@ -397,8 +397,8 @@ define 
<16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15 ret <16 x i32> %res } -define <16 x i32> @mask_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) { -; ALL-LABEL: mask_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: +define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) { +; ALL-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17: ; ALL: # BB#0: ; ALL-NEXT: kmovw %edi, %k1 ; ALL-NEXT: valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1] @@ -422,8 +422,8 @@ define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_1 ret <16 x i32> %res } -define <16 x i32> @maskz_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b, i16 %mask) { -; ALL-LABEL: maskz_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: +define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, i16 %mask) { +; ALL-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17: ; ALL: # BB#0: ; ALL-NEXT: kmovw %edi, %k1 ; ALL-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1] @@ -495,3 +495,55 @@ define <16 x i32> @test_vshufi32x4_512_mask(<16 x i32> %x, <16 x i32> %x1, <16 x %res = select <16 x i1> %mask, <16 x i32> %x2, <16 x i32> %y ret <16 x i32> %res } + +define <16 x float> @mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) { +; ALL-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23: +; ALL: # BB#0: +; ALL-NEXT: kmovw %edi, %k1 +; ALL-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} +; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru + ret <16 x float> %res +} + +define <16 x float> @mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) { +; ALL-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15: +; ALL: # BB#0: +; ALL-NEXT: kmovw %edi, %k1 +; ALL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru + ret <16 x float> %res +} + +define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) { +; ALL-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23: +; ALL: # BB#0: +; ALL-NEXT: kmovw %edi, %k1 +; ALL-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} +; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 
1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru + ret <16 x i32> %res +} + +define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) { +; ALL-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15: +; ALL: # BB#0: +; ALL-NEXT: kmovw %edi, %k1 +; ALL-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru + ret <16 x i32> %res +} diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll index 625681dc294c..365ff3bf63d5 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -2375,3 +2375,199 @@ define <8 x i64> @maskz_shuffle_v8i64_12345670(<8 x i64> %a, i8 %mask) { %res = select <8 x i1> %mask.cast, <8 x i64> %shuffle, <8 x i64> zeroinitializer ret <8 x i64> %res } + +define <8 x double> @shuffle_v8f64_012389AB(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_012389AB: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_012389AB: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_89AB0123(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_89AB0123: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_89AB0123: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_01230123(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_01230123: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_01230123: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> + ret <8 x double> %shuffle +} + +define <8 x i64> @shuffle_v8i64_012389AB(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_012389AB: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_012389AB: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + ret <8 x i64> %shuffle +} + 
+define <8 x i64> @shuffle_v8i64_89AB0123(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_89AB0123: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_89AB0123: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01230123(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_01230123: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_01230123: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> + ret <8 x i64> %shuffle +} + +define <8 x double> @shuffle_v8f64_89234567(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_89234567: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_89234567: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_01894567(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_01894567: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_01894567: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_01238967(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_01238967: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_01238967: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_01234589(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_01234589: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_01234589: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9> + ret <8 x double> %shuffle +} + +define <8 x i64> @shuffle_v8i64_89234567(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_89234567: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_89234567: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector 
<8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01894567(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_01894567: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_01894567: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01238967(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_01238967: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_01238967: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01234589(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_01234589: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti32x4 $3, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_01234589: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti32x4 $3, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9> + ret <8 x i64> %shuffle +} diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll index 1dcfd3223c86..f828ed0ba6e7 100644 --- a/test/CodeGen/X86/vector-trunc-math.ll +++ b/test/CodeGen/X86/vector-trunc-math.ll @@ -419,40 +419,31 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v4i64_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 +; 
AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -462,52 +453,39 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: paddq %xmm0, %xmm4 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm1 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm2 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: paddw {{.*}}(%rip), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; 
AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -515,14 +493,14 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v8i64_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -532,41 +510,38 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -576,17 +551,6 @@ define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> 
%a0) nounwind { ; SSE-LABEL: trunc_add_const_v16i64_v16i8: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: paddq %xmm8, %xmm0 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm1 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm2 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm3 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm4 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm5 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm6 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -603,50 +567,37 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v16i64_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm4 -; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 -; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 -; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -666,37 +617,35 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_add_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -706,10 +655,6 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v16i32_v16i8: ; SSE: # BB#0: -; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 -; SSE-NEXT: paddd {{.*}}(%rip), %xmm2 -; SSE-NEXT: paddd {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -718,31 +663,27 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v16i32_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb 
%xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -752,13 +693,14 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v16i32_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -768,56 +710,54 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v16i16_v16i8: ; SSE: # BB#0: -; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddw {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v16i16_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v16i16_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_add_const_v16i16_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512BW-NEXT: vpmovwb 
%zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -1676,69 +1616,39 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX1-LABEL: trunc_mul_v4i64_v4i32: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_mul_v4i64_v4i32: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512F-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512F-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512F-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2 
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: @@ -1757,46 +1667,17 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE-LABEL: trunc_mul_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm0, %xmm9 -; SSE-NEXT: paddq %xmm8, %xmm9 -; SSE-NEXT: psllq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm1, %xmm4 -; SSE-NEXT: paddq %xmm8, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm5, %xmm1 -; SSE-NEXT: paddq %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm2, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm7, %xmm3 -; SSE-NEXT: paddq %xmm5, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -1808,111 +1689,68 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: pmullw %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 -; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5 -; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5 -; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4 -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm2[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_mul_v8i64_v8i16: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512F-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512F-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512F-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: @@ -2186,104 +2024,60 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX2-LABEL: trunc_mul_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm8 -; AVX2-NEXT: vpmuludq %ymm5, %ymm8, %ymm8 -; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm9 -; AVX2-NEXT: vpmuludq %ymm9, %ymm1, %ymm9 -; AVX2-NEXT: vpaddq %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8 -; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm8, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm5 -; AVX2-NEXT: vpmuludq %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm8 -; AVX2-NEXT: vpmuludq %ymm8, %ymm0, %ymm8 -; AVX2-NEXT: vpaddq %ymm5, %ymm8, %ymm5 -; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4 -; AVX2-NEXT: vpmuludq %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm5 -; AVX2-NEXT: vpmuludq %ymm5, %ymm3, %ymm5 -; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 -; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 -; AVX2-NEXT: vpmuludq %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpsrlq 
$32, %ymm6, %ymm5 -; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm5 -; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 -; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vpmulld %xmm7, %xmm3, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpmulld %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmulld %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmulld %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_mul_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm4 -; AVX512F-NEXT: vpmuludq %zmm3, %zmm4, %zmm4 -; AVX512F-NEXT: vpsrlq $32, %zmm3, %zmm5 -; AVX512F-NEXT: vpmuludq %zmm5, %zmm1, %zmm5 -; AVX512F-NEXT: vpaddq %zmm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpsllq $32, %zmm4, %zmm4 -; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddq %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm3 -; AVX512F-NEXT: vpmuludq %zmm2, %zmm3, %zmm3 -; AVX512F-NEXT: vpsrlq $32, %zmm2, %zmm4 -; AVX512F-NEXT: vpmuludq %zmm4, %zmm0, %zmm4 -; AVX512F-NEXT: vpaddq %zmm3, %zmm4, %zmm3 -; AVX512F-NEXT: vpsllq $32, %zmm3, %zmm3 -; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm3, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm3, %ymm3 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: ; AVX512BW: # BB#0: 
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm4, %zmm4 -; AVX512BW-NEXT: vpsrlq $32, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmuludq %zmm5, %zmm1, %zmm5 -; AVX512BW-NEXT: vpaddq %zmm4, %zmm5, %zmm4 -; AVX512BW-NEXT: vpsllq $32, %zmm4, %zmm4 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddq %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm3, %zmm3 -; AVX512BW-NEXT: vpsrlq $32, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmuludq %zmm4, %zmm0, %zmm4 -; AVX512BW-NEXT: vpaddq %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vpsllq $32, %zmm3, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -2479,70 +2273,25 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3] -; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] -; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddq %ymm0, %ymm2, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> 
%YMM0<kill> -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> ret <4 x i32> %2 @@ -2551,36 +2300,6 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm0 -; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm5, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: pmuludq %xmm4, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm4, %xmm2 -; SSE-NEXT: psllq $32, %xmm2 -; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [6,7] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm4, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: paddq %xmm5, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -2592,64 +2311,28 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3] -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5] -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 -; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vextractf128 
$1, %ymm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7] -; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7] -; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3] -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -2657,37 +2340,15 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm0 -; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm0, %zmm2, %zmm0 -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmullq 
{{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_mul_const_v8i64_v8i16: +; AVX512: # BB#0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> ret <8 x i16> %2 @@ -2696,55 +2357,38 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pmuludq %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE-NEXT: pmuludq %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pmuludq %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_mul_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -2907,34 +2551,12 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX2-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,6,7] -; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpsllq $32, 
%ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm5, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3] -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15] -; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 -; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,10,11] -; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -2943,8 +2565,10 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -2955,50 +2579,30 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15] -; AVX512F-NEXT: vpmuludq %zmm2, %zmm1, %zmm3 -; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm1 -; AVX512F-NEXT: vpmuludq %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpsllq $32, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddq %zmm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm3 -; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm0 -; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsllq $32, %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddq %zmm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 -; 
AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: retq @@ -3073,15 +2677,15 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vzeroupper @@ -3547,36 +3151,31 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: andps {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v4i64_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -3586,30 +3185,23 @@ define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> 
@trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: andpd {{.*}}(%rip), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] @@ -3620,13 +3212,12 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -3634,14 +3225,14 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v8i64_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -3651,40 +3242,38 @@ define <8 x i16> 
@trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -3694,41 +3283,27 @@ define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v16i64_v16i8: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE-NEXT: pand {{.*}}(%rip), %xmm5 -; SSE-NEXT: pand {{.*}}(%rip), %xmm6 -; SSE-NEXT: pand {{.*}}(%rip), %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 ; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: packuswb %xmm5, %xmm4 ; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v16i64_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; 
AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 @@ -3749,15 +3324,12 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -3777,37 +3349,35 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_and_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -3817,10 +3387,6 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v16i32_v16i8: ; SSE: # BB#0: -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE-NEXT: pand {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -3829,12 +3395,11 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq 
; ; AVX1-LABEL: trunc_and_const_v16i32_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 @@ -3845,13 +3410,12 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -3861,13 +3425,14 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v16i32_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -3877,55 +3442,54 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v16i16_v16i8: ; SSE: # BB#0: -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v16i16_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v16i16_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_and_const_v16i16_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpand 
{{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -4323,36 +3887,31 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: xorps {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -4362,30 +3921,23 @@ define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: pxor %xmm0, %xmm4 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm2 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: xorpd {{.*}}(%rip), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] @@ -4396,13 +3948,12 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -4410,14 +3961,14 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -4427,40 +3978,38 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -4470,17 +4019,6 @@ define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v16i64_v16i8: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: pxor %xmm8, %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm2 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm3 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm4 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm5 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm6 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -4497,14 +4035,11 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 @@ -4525,15 +4060,12 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -4553,37 +4085,35 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: 
vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -4593,10 +4123,6 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v16i32_v16i8: ; SSE: # BB#0: -; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm2 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -4605,12 +4131,11 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 @@ -4621,13 +4146,12 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -4637,13 +4161,14 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = xor <16 x 
i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -4653,55 +4178,54 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v16i16_v16i8: ; SSE: # BB#0: -; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -5099,36 +4623,31 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: orps {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: orps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
trunc_or_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v4i64_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -5138,30 +4657,23 @@ define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 -; SSE-NEXT: por {{.*}}(%rip), %xmm2 -; SSE-NEXT: por {{.*}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: orpd {{.*}}(%rip), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] @@ -5172,13 +4684,12 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -5186,14 +4697,14 @@ define <8 x i16> 
@trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v8i64_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -5203,40 +4714,38 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -5246,17 +4755,6 @@ define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v16i64_v16i8: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 -; SSE-NEXT: por {{.*}}(%rip), %xmm2 -; SSE-NEXT: por {{.*}}(%rip), %xmm3 -; SSE-NEXT: por {{.*}}(%rip), %xmm4 -; SSE-NEXT: por {{.*}}(%rip), %xmm5 -; SSE-NEXT: por {{.*}}(%rip), %xmm6 -; SSE-NEXT: por {{.*}}(%rip), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -5273,14 +4771,11 @@ define <16 x i8> 
@trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v16i64_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 @@ -5301,15 +4796,12 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -5329,37 +4821,35 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_or_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -5369,10 +4859,6 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v16i32_v16i8: ; SSE: # BB#0: -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 -; SSE-NEXT: por {{.*}}(%rip), %xmm2 -; SSE-NEXT: por {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand 
%xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -5381,12 +4867,11 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v16i32_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 @@ -5397,13 +4882,12 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -5413,13 +4897,14 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v16i32_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpord {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -5429,55 +4914,54 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v16i16_v16i8: ; SSE: # BB#0: -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v16i16_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v16i16_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_or_const_v16i16_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpor 
{{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -5488,49 +4972,204 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; complex patterns - often created by vectorizer ; -define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { -; SSE-LABEL: mul_add_v4i64_v4i32: +define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { +; SSE-LABEL: mul_add_const_v4i64_v4i32: ; SSE: # BB#0: ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrad $31, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrad $31, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm5 +; SSE-NEXT: pmuludq %xmm2, %xmm5 ; SSE-NEXT: paddq %xmm4, %xmm5 ; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 ; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrlq $32, %xmm1 +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 +; SSE-NEXT: paddq %xmm1, %xmm4 ; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pmuludq %xmm3, %xmm0 ; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: mul_add_const_v4i64_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; 
AVX2-LABEL: mul_add_const_v4i64_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: mul_add_const_v4i64_v4i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: mul_add_const_v4i64_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: mul_add_const_v4i64_v4i32: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: retq + %1 = sext <4 x i32> %a0 to <4 x i64> + %2 = sext <4 x i32> %a1 to <4 x i64> + %3 = mul <4 x i64> %1, %2 + %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3> + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} + +define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { +; SSE-LABEL: mul_add_self_v4i64_v4i32: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psrad $31, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrad $31, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: psrlq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm0, %xmm5 +; SSE-NEXT: paddq %xmm4, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: pmuludq %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: paddq %xmm0, %xmm4 +; SSE-NEXT: psllq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: paddq %xmm4, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; SSE-NEXT: paddd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: mul_add_self_v4i64_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: mul_add_self_v4i64_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: mul_add_self_v4i64_v4i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: mul_add_self_v4i64_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: mul_add_self_v4i64_v4i32: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0 +; 
AVX512DQ-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: retq + %1 = sext <4 x i32> %a0 to <4 x i64> + %2 = sext <4 x i32> %a1 to <4 x i64> + %3 = mul <4 x i64> %1, %2 + %4 = add <4 x i64> %3, %3 + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} + +define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { +; SSE-LABEL: mul_add_multiuse_v4i64_v4i32: +; SSE: # BB#0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: paddq %xmm2, %xmm4 +; SSE-NEXT: psllq $32, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: psrlq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: psrlq $32, %xmm1 +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: paddq %xmm6, %xmm1 +; SSE-NEXT: psllq $32, %xmm1 +; SSE-NEXT: paddq %xmm0, %xmm1 +; SSE-NEXT: paddq %xmm1, %xmm2 +; SSE-NEXT: paddq %xmm3, %xmm4 +; SSE-NEXT: paddq %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: mul_add_v4i64_v4i32: +; AVX1-LABEL: mul_add_multiuse_v4i64_v4i32: ; AVX1: # BB#0: ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] @@ -5538,58 +5177,58 @@ define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 -; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2] +; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[0,2] ; AVX1-NEXT: retq ; -; AVX2-LABEL: mul_add_v4i64_v4i32: +; AVX2-LABEL: mul_add_multiuse_v4i64_v4i32: ; AVX2: # BB#0: ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: mul_add_v4i64_v4i32: +; AVX512F-LABEL: mul_add_multiuse_v4i64_v4i32: ; AVX512F: # BB#0: ; AVX512F-NEXT: 
vpmovsxdq %xmm0, %ymm0 ; AVX512F-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX512F-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX512F-NEXT: retq ; -; AVX512BW-LABEL: mul_add_v4i64_v4i32: +; AVX512BW-LABEL: mul_add_multiuse_v4i64_v4i32: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX512BW-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX512BW-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX512BW-NEXT: retq ; -; AVX512DQ-LABEL: mul_add_v4i64_v4i32: +; AVX512DQ-LABEL: mul_add_multiuse_v4i64_v4i32: ; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX512DQ-NEXT: retq %1 = sext <4 x i32> %a0 to <4 x i64> %2 = sext <4 x i32> %a1 to <4 x i64> %3 = mul <4 x i64> %1, %2 - %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3> + %4 = add <4 x i64> %1, %3 %5 = trunc <4 x i64> %4 to <4 x i32> ret <4 x i32> %5 } diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index 8a826e025a33..2571a21ce218 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ b/test/CodeGen/X86/vector-trunc.ll @@ -643,7 +643,7 @@ define void @trunc32i16_32i8(<32 x i16> %a) { ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1 ; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqu %ymm0, (%rax) ; AVX512VL-NEXT: retq ; @@ -701,7 +701,7 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) { ; AVX512VL: # BB#0: # %entry ; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 ; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc2x4i64_8i32: @@ -717,7 +717,7 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) { ; AVX512BWVL: # BB#0: # %entry ; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 ; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 -; AVX512BWVL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq entry: %0 = trunc <4 x i64> %a to <4 x i32> |