diff options
Diffstat (limited to 'test/CodeGen/X86')
35 files changed, 1937 insertions, 462 deletions
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll index 172a00a7c86f..89cb71a52c04 100644 --- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll +++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machine-licm" +; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machinelicm" ; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s ; rdar://6627786 ; rdar://7792037 diff --git a/test/CodeGen/X86/GlobalISel/memop-vec.ll b/test/CodeGen/X86/GlobalISel/memop-vec.ll index f1ffc15f4d03..870e812bbb69 100644 --- a/test/CodeGen/X86/GlobalISel/memop-vec.ll +++ b/test/CodeGen/X86/GlobalISel/memop-vec.ll @@ -1,39 +1,116 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX -; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) { -; ALL-LABEL: test_load_v4i32_noalign: -; ALL: # BB#0: -; ALL-NEXT: vmovups (%rdi), %xmm0 -; ALL-NEXT: retq +; SKX-LABEL: test_load_v4i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %xmm0 +; SKX-NEXT: retq %r = load <4 x i32>, <4 x i32>* %p1, align 1 ret <4 x i32> %r } define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) { -; ALL-LABEL: test_load_v4i32_align: -; ALL: # BB#0: -; ALL-NEXT: vmovaps (%rdi), %xmm0 -; ALL-NEXT: retq +; SKX-LABEL: test_load_v4i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps (%rdi), %xmm0 +; SKX-NEXT: retq %r = load <4 x i32>, <4 x i32>* %p1, align 16 ret <4 x i32> %r } +define <8 x i32> @test_load_v8i32_noalign(<8 x i32> * %p1) { +; SKX-LABEL: test_load_v8i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %ymm0 +; SKX-NEXT: retq + %r = load <8 x i32>, <8 x i32>* %p1, align 1 + ret <8 x i32> %r +} + +define <8 x i32> @test_load_v8i32_align(<8 x i32> * %p1) { +; SKX-LABEL: test_load_v8i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps (%rdi), %ymm0 +; SKX-NEXT: retq + %r = load <8 x i32>, <8 x i32>* %p1, align 32 + ret <8 x i32> %r +} + +define <16 x i32> @test_load_v16i32_noalign(<16 x i32> * %p1) { +; SKX-LABEL: test_load_v16i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %zmm0 +; SKX-NEXT: retq + %r = load <16 x i32>, <16 x i32>* %p1, align 1 + ret <16 x i32> %r +} + +define <16 x i32> @test_load_v16i32_align(<16 x i32> * %p1) { +; SKX-LABEL: test_load_v16i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %zmm0 +; SKX-NEXT: retq + %r = load <16 x i32>, <16 x i32>* %p1, align 32 + ret <16 x i32> %r +} + define void @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { -; ALL-LABEL: test_store_v4i32_noalign: -; ALL: # BB#0: -; ALL-NEXT: vmovups %xmm0, (%rdi) -; ALL-NEXT: retq +; SKX-LABEL: test_store_v4i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: retq store <4 x i32> %val, <4 x i32>* %p1, align 1 ret void } define void @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { -; ALL-LABEL: test_store_v4i32_align: -; ALL: # BB#0: -; ALL-NEXT: vmovaps %xmm0, (%rdi) -; ALL-NEXT: retq +; SKX-LABEL: test_store_v4i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: retq store <4 x i32> %val, <4 x i32>* %p1, align 16 ret void } + +define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) { +; SKX-LABEL: test_store_v8i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + store <8 x i32> %val, <8 x i32>* %p1, align 1 + ret void +} + +define void @test_store_v8i32_align(<8 x i32> %val, <8 x i32>* %p1) { +; SKX-LABEL: test_store_v8i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + store <8 x i32> %val, <8 x i32>* %p1, align 32 + ret void +} + +define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) { +; SKX-LABEL: test_store_v16i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups %zmm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + store <16 x i32> %val, <16 x i32>* %p1, align 1 + ret void +} + +define void @test_store_v16i32_align(<16 x i32> %val, <16 x i32>* %p1) { +; SKX-LABEL: test_store_v16i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps %zmm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + store <16 x i32> %val, <16 x i32>* %p1, align 64 + ret void +} + diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir b/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir index f925c836f3d1..cc03f3a57f0b 100644 --- a/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir +++ b/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir @@ -14,7 +14,16 @@ ret void } -... + define <8 x i32> @test_load_v8i32_noalign(<8 x i32>* %p1) { + %r = load <8 x i32>, <8 x i32>* %p1, align 1 + ret <8 x i32> %r + } + + define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) { + store <8 x i32> %val, <8 x i32>* %p1, align 1 + ret void + } + --- name: test_mul_vec256 alignment: 4 @@ -84,3 +93,47 @@ body: | RET 0 ... +--- +name: test_load_v8i32_noalign +# CHECK-LABEL: name: test_load_v8i32_noalign +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: vecr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<8 x s32>) = G_LOAD %0(p0) :: (load 32 from %ir.p1, align 1) + %ymm0 = COPY %1(<8 x s32>) + RET 0, implicit %ymm0 + +... +--- +name: test_store_v8i32_noalign +# CHECK-LABEL: name: test_store_v8i32_noalign +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %ymm0 + + %0(<8 x s32>) = COPY %ymm0 + %1(p0) = COPY %rdi + G_STORE %0(<8 x s32>), %1(p0) :: (store 32 into %ir.p1, align 1) + RET 0 + +... diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir b/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir index e0c12ff44a2f..278413ad38ef 100644 --- a/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir +++ b/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir @@ -15,22 +15,29 @@ ret void } + define <16 x i32> @test_load_v16i32_noalign(<16 x i32>* %p1) { + %r = load <16 x i32>, <16 x i32>* %p1, align 1 + ret <16 x i32> %r + } + + define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) { + store <16 x i32> %val, <16 x i32>* %p1, align 1 + ret void + } + ... --- name: test_mul_vec512 +# CHECK-LABEL: name: test_mul_vec512 alignment: 4 legalized: true regBankSelected: false -selected: false -tracksRegLiveness: true -# CHECK-LABEL: name: test_mul_vec512 -# CHECK: registers: -# CHECK: - { id: 0, class: vecr } -# CHECK: - { id: 1, class: vecr } +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: vecr } registers: - { id: 0, class: _ } - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.1 (%ir-block.0): @@ -41,19 +48,16 @@ body: | ... --- name: test_add_vec512 +# CHECK-LABEL: name: test_add_vec512 alignment: 4 legalized: true regBankSelected: false -selected: false -tracksRegLiveness: true -# CHECK-LABEL: name: test_add_vec512 -# CHECK: registers: -# CHECK: - { id: 0, class: vecr } -# CHECK: - { id: 1, class: vecr } +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: vecr } registers: - { id: 0, class: _ } - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.1 (%ir-block.0): @@ -64,24 +68,65 @@ body: | ... --- name: test_sub_vec512 +# CHECK-LABEL: name: test_sub_vec512 alignment: 4 legalized: true regBankSelected: false -selected: false -tracksRegLiveness: true -# CHECK-LABEL: name: test_sub_vec512 -# CHECK: registers: -# CHECK: - { id: 0, class: vecr } -# CHECK: - { id: 1, class: vecr } +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: vecr } registers: - { id: 0, class: _ } - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.1 (%ir-block.0): %0(<16 x s32>) = IMPLICIT_DEF %1(<16 x s32>) = G_SUB %0, %0 RET 0 +... +--- + +name: test_load_v16i32_noalign +# CHECK-LABEL: name: test_load_v16i32_noalign +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: vecr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 1) + %zmm0 = COPY %1(<16 x s32>) + RET 0, implicit %zmm0 + +... +--- +name: test_store_v16i32_noalign +# CHECK-LABEL: name: test_store_v16i32_noalign +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %zmm0 + + %0(<16 x s32>) = COPY %zmm0 + %1(p0) = COPY %rdi + G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 1) + RET 0 ... diff --git a/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir b/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir new file mode 100644 index 000000000000..539520c0b8f5 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir @@ -0,0 +1,96 @@ +# RUN: llc -mtriple=i586-linux-gnu -mcpu=haswell -mattr=-slow-incdec -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK +# +# This is necessary to test that attribute-based rule predicates work and that +# they properly reset between functions. + +--- | + define i32 @const_i32_1() { + ret i32 1 + } + + define i32 @const_i32_1_optsize() #0 { + ret i32 1 + } + + define i32 @const_i32_1b() { + ret i32 1 + } + + define i32 @const_i32_1_optsizeb() #0 { + ret i32 1 + } + + attributes #0 = { optsize } +... +--- +name: const_i32_1 +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: const_i32_1 +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +registers: + - { id: 0, class: gpr } +# CHECK: body: +# CHECK: %0 = MOV32ri 1 +body: | + bb.1 (%ir-block.0): + %0(s32) = G_CONSTANT i32 1 + %eax = COPY %0(s32) + RET 0, implicit %eax +... +--- +name: const_i32_1_optsize +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: const_i32_1_optsize +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +registers: + - { id: 0, class: gpr } +# CHECK: body: +# CHECK: %0 = MOV32r1 +body: | + bb.1 (%ir-block.0): + %0(s32) = G_CONSTANT i32 1 + %eax = COPY %0(s32) + RET 0, implicit %eax +... +--- +name: const_i32_1b +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: const_i32_1b +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +registers: + - { id: 0, class: gpr } +# CHECK: body: +# CHECK: %0 = MOV32ri 1 +body: | + bb.1 (%ir-block.0): + %0(s32) = G_CONSTANT i32 1 + %eax = COPY %0(s32) + RET 0, implicit %eax +... +--- +name: const_i32_1_optsizeb +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: const_i32_1_optsizeb +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +registers: + - { id: 0, class: gpr } +# CHECK: body: +# CHECK: %0 = MOV32r1 +body: | + bb.1 (%ir-block.0): + %0(s32) = G_CONSTANT i32 1 + %eax = COPY %0(s32) + RET 0, implicit %eax +... diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v256.mir b/test/CodeGen/X86/GlobalISel/select-memop-v256.mir new file mode 100644 index 000000000000..b9a7e4a8cc4a --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop-v256.mir @@ -0,0 +1,188 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL + + +--- | + define <8 x i32> @test_load_v8i32_noalign(<8 x i32>* %p1) { + %r = load <8 x i32>, <8 x i32>* %p1, align 1 + ret <8 x i32> %r + } + + define <8 x i32> @test_load_v8i32_align(<8 x i32>* %p1) { + %r = load <8 x i32>, <8 x i32>* %p1, align 32 + ret <8 x i32> %r + } + + define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) { + store <8 x i32> %val, <8 x i32>* %p1, align 1 + ret void + } + + define void @test_store_v8i32_align(<8 x i32> %val, <8 x i32>* %p1) { + store <8 x i32> %val, <8 x i32>* %p1, align 32 + ret void + } + + +... +--- +name: test_load_v8i32_noalign +# ALL-LABEL: name: test_load_v8i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +# NO_AVX512F: registers: +# NO_AVX512F-NEXT: - { id: 0, class: gr64 } +# NO_AVX512F-NEXT: - { id: 1, class: vr256 } +# +# AVX512ALL: registers: +# AVX512ALL-NEXT: - { id: 0, class: gr64 } +# AVX512ALL-NEXT: - { id: 1, class: vr256x } +registers: + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# NO_AVX512F: %0 = COPY %rdi +# NO_AVX512F-NEXT: %1 = VMOVUPSYrm %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1) +# NO_AVX512F-NEXT: %ymm0 = COPY %1 +# NO_AVX512F-NEXT: RET 0, implicit %ymm0 +# +# AVX512F: %0 = COPY %rdi +# AVX512F-NEXT: %1 = VMOVUPSZ256rm_NOVLX %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1) +# AVX512F-NEXT: %ymm0 = COPY %1 +# AVX512F-NEXT: RET 0, implicit %ymm0 +# +# AVX512VL: %0 = COPY %rdi +# AVX512VL-NEXT: %1 = VMOVUPSZ256rm %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1) +# AVX512VL-NEXT: %ymm0 = COPY %1 +# AVX512VL-NEXT: RET 0, implicit %ymm0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<8 x s32>) = G_LOAD %0(p0) :: (load 32 from %ir.p1, align 1) + %ymm0 = COPY %1(<8 x s32>) + RET 0, implicit %ymm0 + +... +--- +name: test_load_v8i32_align +# ALL-LABEL: name: test_load_v8i32_align +alignment: 4 +legalized: true +regBankSelected: true +# NO_AVX512F: registers: +# NO_AVX512F-NEXT: - { id: 0, class: gr64 } +# NO_AVX512F-NEXT: - { id: 1, class: vr256 } +# +# AVX512ALL: registers: +# AVX512ALL-NEXT: - { id: 0, class: gr64 } +# AVX512ALL-NEXT: - { id: 1, class: vr256x } +registers: + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# NO_AVX512F: %0 = COPY %rdi +# NO_AVX512F-NEXT: %1 = VMOVAPSYrm %0, 1, _, 0, _ :: (load 32 from %ir.p1) +# NO_AVX512F-NEXT: %ymm0 = COPY %1 +# NO_AVX512F-NEXT: RET 0, implicit %ymm0 +# +# AVX512F: %0 = COPY %rdi +# AVX512F-NEXT: %1 = VMOVAPSZ256rm_NOVLX %0, 1, _, 0, _ :: (load 32 from %ir.p1) +# AVX512F-NEXT: %ymm0 = COPY %1 +# AVX512F-NEXT: RET 0, implicit %ymm0 +# +# AVX512VL: %0 = COPY %rdi +# AVX512VL-NEXT: %1 = VMOVAPSZ256rm %0, 1, _, 0, _ :: (load 32 from %ir.p1) +# AVX512VL-NEXT: %ymm0 = COPY %1 +# AVX512VL-NEXT: RET 0, implicit %ymm0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<8 x s32>) = G_LOAD %0(p0) :: (load 32 from %ir.p1) + %ymm0 = COPY %1(<8 x s32>) + RET 0, implicit %ymm0 + +... +--- +name: test_store_v8i32_noalign +# ALL-LABEL: name: test_store_v8i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +# NO_AVX512F: registers: +# NO_AVX512F-NEXT: - { id: 0, class: vr256 } +# NO_AVX512F-NEXT: - { id: 1, class: gr64 } +# +# AVX512ALL: registers: +# AVX512ALL-NEXT: - { id: 0, class: vr256x } +# AVX512ALL-NEXT: - { id: 1, class: gr64 } +registers: + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# NO_AVX512F: %0 = COPY %ymm0 +# NO_AVX512F-NEXT: %1 = COPY %rdi +# NO_AVX512F-NEXT: VMOVUPSYmr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1) +# NO_AVX512F-NEXT: RET 0 +# +# AVX512F: %0 = COPY %ymm0 +# AVX512F-NEXT: %1 = COPY %rdi +# AVX512F-NEXT: VMOVUPSZ256mr_NOVLX %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1) +# AVX512F-NEXT: RET 0 +# +# AVX512VL: %0 = COPY %ymm0 +# AVX512VL-NEXT: %1 = COPY %rdi +# AVX512VL-NEXT: VMOVUPSZ256mr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1) +# AVX512VL-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %ymm0 + + %0(<8 x s32>) = COPY %ymm0 + %1(p0) = COPY %rdi + G_STORE %0(<8 x s32>), %1(p0) :: (store 32 into %ir.p1, align 1) + RET 0 + +... +--- +name: test_store_v8i32_align +# ALL-LABEL: name: test_store_v8i32_align +alignment: 4 +legalized: true +regBankSelected: true +# NO_AVX512F: registers: +# NO_AVX512F-NEXT: - { id: 0, class: vr256 } +# NO_AVX512F-NEXT: - { id: 1, class: gr64 } +# +# AVX512ALL: registers: +# AVX512ALL-NEXT: - { id: 0, class: vr256x } +# AVX512ALL-NEXT: - { id: 1, class: gr64 } +registers: + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# NO_AVX512F: %0 = COPY %ymm0 +# NO_AVX512F-NEXT: %1 = COPY %rdi +# NO_AVX512F-NEXT: VMOVAPSYmr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1) +# NO_AVX512F-NEXT: RET 0 +# +# AVX512F: %0 = COPY %ymm0 +# AVX512F-NEXT: %1 = COPY %rdi +# AVX512F-NEXT: VMOVAPSZ256mr_NOVLX %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1) +# AVX512F-NEXT: RET 0 +# +# AVX512VL: %0 = COPY %ymm0 +# AVX512VL-NEXT: %1 = COPY %rdi +# AVX512VL-NEXT: VMOVAPSZ256mr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1) +# AVX512VL-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %ymm0 + + %0(<8 x s32>) = COPY %ymm0 + %1(p0) = COPY %rdi + G_STORE %0(<8 x s32>), %1(p0) :: (store 32 into %ir.p1) + RET 0 + +... diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v512.mir b/test/CodeGen/X86/GlobalISel/select-memop-v512.mir new file mode 100644 index 000000000000..87978a684d4c --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop-v512.mir @@ -0,0 +1,127 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX512F +--- | + define <16 x i32> @test_load_v16i32_noalign(<16 x i32>* %p1) { + %r = load <16 x i32>, <16 x i32>* %p1, align 1 + ret <16 x i32> %r + } + + define <16 x i32> @test_load_v16i32_align(<16 x i32>* %p1) { + %r = load <16 x i32>, <16 x i32>* %p1, align 32 + ret <16 x i32> %r + } + + define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) { + store <16 x i32> %val, <16 x i32>* %p1, align 1 + ret void + } + + define void @test_store_v16i32_align(<16 x i32> %val, <16 x i32>* %p1) { + store <16 x i32> %val, <16 x i32>* %p1, align 32 + ret void + } + +... +--- +name: test_load_v16i32_noalign +# AVX512F-LABEL: name: test_load_v16i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +# AVX512F: registers: +# AVX512F-NEXT: - { id: 0, class: gr64 } +# AVX512F-NEXT: - { id: 1, class: vr512 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# AVX512F: %0 = COPY %rdi +# AVX512F-NEXT: %1 = VMOVUPSZrm %0, 1, _, 0, _ :: (load 64 from %ir.p1, align 1) +# AVX512F-NEXT: %zmm0 = COPY %1 +# AVX512F-NEXT: RET 0, implicit %zmm0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 1) + %zmm0 = COPY %1(<16 x s32>) + RET 0, implicit %zmm0 + +... +--- +name: test_load_v16i32_align +# AVX512F-LABEL: name: test_load_v16i32_align +alignment: 4 +legalized: true +regBankSelected: true +# AVX512F: registers: +# AVX512F-NEXT: - { id: 0, class: gr64 } +# AVX512F-NEXT: - { id: 1, class: vr512 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# AVX512F: %0 = COPY %rdi +# AVX512F-NEXT: %1 = VMOVUPSZrm %0, 1, _, 0, _ :: (load 64 from %ir.p1, align 32) +# AVX512F-NEXT: %zmm0 = COPY %1 +# AVX512F-NEXT: RET 0, implicit %zmm0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 32) + %zmm0 = COPY %1(<16 x s32>) + RET 0, implicit %zmm0 + +... +--- +name: test_store_v16i32_noalign +# AVX512F-LABEL: name: test_store_v16i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +# AVX512F: registers: +# AVX512F-NEXT: - { id: 0, class: vr512 } +# AVX512F-NEXT: - { id: 1, class: gr64 } +registers: + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# AVX512F: %0 = COPY %zmm0 +# AVX512F-NEXT: %1 = COPY %rdi +# AVX512F-NEXT: VMOVUPSZmr %1, 1, _, 0, _, %0 :: (store 64 into %ir.p1, align 1) +# AVX512F-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %zmm0 + + %0(<16 x s32>) = COPY %zmm0 + %1(p0) = COPY %rdi + G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 1) + RET 0 + +... +--- +name: test_store_v16i32_align +# AVX512F-LABEL: name: test_store_v16i32_align +alignment: 4 +legalized: true +regBankSelected: true +# AVX512F: registers: +# AVX512F-NEXT: - { id: 0, class: vr512 } +# AVX512F-NEXT: - { id: 1, class: gr64 } +registers: + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# AVX512F: %0 = COPY %zmm0 +# AVX512F-NEXT: %1 = COPY %rdi +# AVX512F-NEXT: VMOVUPSZmr %1, 1, _, 0, _, %0 :: (store 64 into %ir.p1, align 32) +# AVX512F-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %zmm0 + + %0(<16 x s32>) = COPY %zmm0 + %1(p0) = COPY %rdi + G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 32) + RET 0 + +... diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll index cf514d7aeb31..016ddb9c5e78 100644 --- a/test/CodeGen/X86/avx-vzeroupper.ll +++ b/test/CodeGen/X86/avx-vzeroupper.ll @@ -1,10 +1,8 @@ -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck --check-prefix=FAST-YMM-ZMM %s -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s - -; FAST-YMM-ZMM-NOT: vzeroupper -; BTVER2-NOT: vzeroupper +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512 +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-YMM-ZMM +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2 declare i32 @foo() declare <4 x float> @do_sse(<4 x float>) @@ -15,43 +13,86 @@ declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind ;; Basic checking - don't emit any vzeroupper instruction -; CHECK: _test00 -define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp { -entry: - ; CHECK-NOT: vzeroupper +define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind { +; ALL-LABEL: test00: +; ALL: # BB#0: +; ALL-NEXT: pushq %rax +; ALL-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; ALL-NEXT: callq do_sse +; ALL-NEXT: popq %rax +; ALL-NEXT: retq %add.i = fadd <4 x float> %a, %b %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind - ; CHECK: ret ret <4 x float> %call3 } ;; Check parameter 256-bit parameter passing -; CHECK: _test01 -define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp { -entry: +define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind { +; VZ-LABEL: test01: +; VZ: # BB#0: +; VZ-NEXT: subq $56, %rsp +; VZ-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; VZ-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; VZ-NEXT: vzeroupper +; VZ-NEXT: callq do_sse +; VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; VZ-NEXT: callq do_sse +; VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; VZ-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; VZ-NEXT: addq $56, %rsp +; VZ-NEXT: retq +; +; FAST-YMM-ZMM-LABEL: test01: +; FAST-YMM-ZMM: # BB#0: +; FAST-YMM-ZMM-NEXT: subq $56, %rsp +; FAST-YMM-ZMM-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; FAST-YMM-ZMM-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; FAST-YMM-ZMM-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; FAST-YMM-ZMM-NEXT: addq $56, %rsp +; FAST-YMM-ZMM-NEXT: retq +; +; BTVER2-LABEL: test01: +; BTVER2: # BB#0: +; BTVER2-NEXT: subq $56, %rsp +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; BTVER2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; BTVER2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; BTVER2-NEXT: addq $56, %rsp +; BTVER2-NEXT: retq %tmp = load <4 x float>, <4 x float>* @x, align 16 - ; CHECK: vzeroupper - ; CHECK-NEXT: callq _do_sse %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind store <4 x float> %call, <4 x float>* @x, align 16 - ; CHECK-NOT: vzeroupper - ; CHECK: callq _do_sse %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind store <4 x float> %call2, <4 x float>* @x, align 16 - ; CHECK: ret ret <8 x float> %c } ;; Check that vzeroupper is emitted for tail calls. -; CHECK: _test02 -define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind uwtable ssp { -entry: +define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind { +; VZ-LABEL: test02: +; VZ: # BB#0: +; VZ-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; VZ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; VZ-NEXT: vzeroupper +; VZ-NEXT: jmp do_sse # TAILCALL +; +; NO-VZ-LABEL: test02: +; NO-VZ: # BB#0: +; NO-VZ-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; NO-VZ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; NO-VZ-NEXT: jmp do_sse # TAILCALL %add.i = fadd <8 x float> %a, %b %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0) - ; CHECK: vzeroupper - ; CHECK: jmp _do_sse %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind ret <4 x float> %call3 } @@ -59,30 +100,113 @@ entry: ;; Test the pass convergence and also that vzeroupper is only issued when necessary, ;; for this function it should be only once -; CHECK: _test03 -define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp { +define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind { +; VZ-LABEL: test03: +; VZ: # BB#0: # %entry +; VZ-NEXT: pushq %rbx +; VZ-NEXT: subq $16, %rsp +; VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; VZ-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; VZ-NEXT: .p2align 4, 0x90 +; VZ-NEXT: .LBB3_1: # %while.cond +; VZ-NEXT: # =>This Inner Loop Header: Depth=1 +; VZ-NEXT: callq foo +; VZ-NEXT: testl %eax, %eax +; VZ-NEXT: jne .LBB3_1 +; VZ-NEXT: # BB#2: # %for.body.preheader +; VZ-NEXT: movl $4, %ebx +; VZ-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; VZ-NEXT: .p2align 4, 0x90 +; VZ-NEXT: .LBB3_3: # %for.body +; VZ-NEXT: # =>This Inner Loop Header: Depth=1 +; VZ-NEXT: callq do_sse +; VZ-NEXT: callq do_sse +; VZ-NEXT: vmovaps {{.*}}(%rip), %ymm0 +; VZ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; VZ-NEXT: vzeroupper +; VZ-NEXT: callq do_sse +; VZ-NEXT: decl %ebx +; VZ-NEXT: jne .LBB3_3 +; VZ-NEXT: # BB#4: # %for.end +; VZ-NEXT: addq $16, %rsp +; VZ-NEXT: popq %rbx +; VZ-NEXT: retq +; +; FAST-YMM-ZMM-LABEL: test03: +; FAST-YMM-ZMM: # BB#0: # %entry +; FAST-YMM-ZMM-NEXT: pushq %rbx +; FAST-YMM-ZMM-NEXT: subq $16, %rsp +; FAST-YMM-ZMM-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; FAST-YMM-ZMM-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; FAST-YMM-ZMM-NEXT: .p2align 4, 0x90 +; FAST-YMM-ZMM-NEXT: .LBB3_1: # %while.cond +; FAST-YMM-ZMM-NEXT: # =>This Inner Loop Header: Depth=1 +; FAST-YMM-ZMM-NEXT: callq foo +; FAST-YMM-ZMM-NEXT: testl %eax, %eax +; FAST-YMM-ZMM-NEXT: jne .LBB3_1 +; FAST-YMM-ZMM-NEXT: # BB#2: # %for.body.preheader +; FAST-YMM-ZMM-NEXT: movl $4, %ebx +; FAST-YMM-ZMM-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; FAST-YMM-ZMM-NEXT: .p2align 4, 0x90 +; FAST-YMM-ZMM-NEXT: .LBB3_3: # %for.body +; FAST-YMM-ZMM-NEXT: # =>This Inner Loop Header: Depth=1 +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: vmovaps {{.*}}(%rip), %ymm0 +; FAST-YMM-ZMM-NEXT: vextractf128 $1, %ymm0, %xmm0 +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: decl %ebx +; FAST-YMM-ZMM-NEXT: jne .LBB3_3 +; FAST-YMM-ZMM-NEXT: # BB#4: # %for.end +; FAST-YMM-ZMM-NEXT: addq $16, %rsp +; FAST-YMM-ZMM-NEXT: popq %rbx +; FAST-YMM-ZMM-NEXT: retq +; +; BTVER2-LABEL: test03: +; BTVER2: # BB#0: # %entry +; BTVER2-NEXT: pushq %rbx +; BTVER2-NEXT: subq $16, %rsp +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; BTVER2-NEXT: .p2align 4, 0x90 +; BTVER2-NEXT: .LBB3_1: # %while.cond +; BTVER2-NEXT: # =>This Inner Loop Header: Depth=1 +; BTVER2-NEXT: callq foo +; BTVER2-NEXT: testl %eax, %eax +; BTVER2-NEXT: jne .LBB3_1 +; BTVER2-NEXT: # BB#2: # %for.body.preheader +; BTVER2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; BTVER2-NEXT: movl $4, %ebx +; BTVER2-NEXT: .p2align 4, 0x90 +; BTVER2-NEXT: .LBB3_3: # %for.body +; BTVER2-NEXT: # =>This Inner Loop Header: Depth=1 +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm0 +; BTVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: decl %ebx +; BTVER2-NEXT: jne .LBB3_3 +; BTVER2-NEXT: # BB#4: # %for.end +; BTVER2-NEXT: addq $16, %rsp +; BTVER2-NEXT: popq %rbx +; BTVER2-NEXT: retq entry: %add.i = fadd <4 x float> %a, %b br label %while.cond -while.cond: +while.cond: %call = tail call i32 @foo() %tobool = icmp eq i32 %call, 0 br i1 %tobool, label %for.body, label %while.cond for.body: - ; CHECK: LBB - ; CHECK-NOT: vzeroupper %i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ] %c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ] - ; CHECK: callq _do_sse %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind - ; CHECK-NEXT: callq _do_sse %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind %tmp11 = load <8 x float>, <8 x float>* @g, align 32 %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind - ; CHECK: vzeroupper - ; CHECK-NEXT: callq _do_sse %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind %1 = add nsw i32 %i.018, 1 %exitcond = icmp eq i32 %1, 4 @@ -94,15 +218,30 @@ for.end: ;; Check that we also perform vzeroupper when we return from a function. -; CHECK: _test04 -define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp { -entry: +define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind { +; VZ-LABEL: test04: +; VZ: # BB#0: +; VZ-NEXT: pushq %rax +; VZ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; VZ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VZ-NEXT: callq do_avx +; VZ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; VZ-NEXT: popq %rax +; VZ-NEXT: vzeroupper +; VZ-NEXT: retq +; +; NO-VZ-LABEL: test04: +; NO-VZ: # BB#0: +; NO-VZ-NEXT: pushq %rax +; NO-VZ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; NO-VZ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NO-VZ-NEXT: callq do_avx +; NO-VZ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; NO-VZ-NEXT: popq %rax +; NO-VZ-NEXT: retq %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - ; CHECK-NOT: vzeroupper - ; CHECK: call %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> - ; CHECK: vzeroupper - ; CHECK: ret ret <4 x float> %shuf2 } + diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 3337f42eb142..51f9a382ccbf 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -2216,9 +2216,9 @@ define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { ; ; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: kunpckwd %k1, %k0, %k0 +; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0 ; AVX512F-32-NEXT: kmovd %k0, %eax ; AVX512F-32-NEXT: retl %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) diff --git a/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll b/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll new file mode 100644 index 000000000000..019c5282f63b --- /dev/null +++ b/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq --show-mc-encoding | FileCheck %s --check-prefix=X86_64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vpopcntdq --show-mc-encoding | FileCheck %s --check-prefix=X86 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; The following tests check that patterns that includes ;; +;; ctpop intrinsic + select are translated to the vpopcntd/q ;; +;; instruction in a correct way. ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define <16 x i32> @test_mask_vpopcnt_d(<16 x i32> %a, i16 %mask, <16 x i32> %b) { +; X86_64-LABEL: test_mask_vpopcnt_d: +; X86_64: # BB#0: +; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X86_64-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x55,0xc1] +; X86_64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mask_vpopcnt_d: +; X86: # BB#0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x55,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %a + ret <16 x i32> %3 +} + +define <16 x i32> @test_maskz_vpopcnt_d(i16 %mask, <16 x i32> %a) { +; X86_64-LABEL: test_maskz_vpopcnt_d: +; X86_64: # BB#0: +; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X86_64-NEXT: vpopcntd %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x55,0xc0] +; X86_64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_maskz_vpopcnt_d: +; X86: # BB#0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpopcntd %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x55,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 +} + +define <8 x i64> @test_mask_vpopcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { +; X86_64-LABEL: test_mask_vpopcnt_q: +; X86_64: # BB#0: +; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X86_64-NEXT: vpopcntq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x55,0xc8] +; X86_64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X86_64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mask_vpopcnt_q: +; X86: # BB#0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpopcntq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x55,0xc8] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %b + ret <8 x i64> %3 +} + +define <8 x i64> @test_maskz_vpopcnt_q(<8 x i64> %a, i8 %mask) { +; X86_64-LABEL: test_maskz_vpopcnt_q: +; X86_64: # BB#0: +; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X86_64-NEXT: vpopcntq %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x55,0xc0] +; X86_64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_maskz_vpopcnt_q: +; X86: # BB#0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpopcntq %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x55,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 +} + +declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) +declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) diff --git a/test/CodeGen/X86/fast-isel-select-cmp.ll b/test/CodeGen/X86/fast-isel-select-cmp.ll index 1af30e9f32fe..4a8e8792f98d 100644 --- a/test/CodeGen/X86/fast-isel-select-cmp.ll +++ b/test/CodeGen/X86/fast-isel-select-cmp.ll @@ -4,9 +4,9 @@ ; different basic blocks. define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) { -; CHECK-LABEL: select_cmp_cmov_i32 +; CHECK-LABEL: select_cmp_cmov_i32: ; CHECK-LABEL: continue -; CHECK-NOT: cmp +; CHECK-NOT: cmp{{[^_]}} %1 = icmp ult i32 %a, %b br i1 %1, label %continue, label %exit @@ -19,9 +19,9 @@ exit: } define float @select_fcmp_oeq_f32(float %a, float %b, float %c, float %d) { -; CHECK-LABEL: select_fcmp_oeq_f32 +; CHECK-LABEL: select_fcmp_oeq_f32: ; CHECK-LABEL: continue -; CHECK-NOT: cmp +; CHECK-NOT: cmp{{[^_]}} %1 = fcmp oeq float %a, %b br i1 %1, label %continue, label %exit @@ -34,7 +34,7 @@ exit: } define float @select_fcmp_one_f32(float %a, float %b, float %c, float %d) { -; CHECK-LABEL: select_fcmp_one_f32 +; CHECK-LABEL: select_fcmp_one_f32: ; CHECK-LABEL: continue ; CHECK-NOT: ucomi %1 = fcmp one float %a, %b diff --git a/test/CodeGen/X86/fp-intrinsics.ll b/test/CodeGen/X86/fp-intrinsics.ll index 88aef6bb0659..0f8d730d7535 100644 --- a/test/CodeGen/X86/fp-intrinsics.ll +++ b/test/CodeGen/X86/fp-intrinsics.ll @@ -103,9 +103,156 @@ if.end: ret double %a.0 } +; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f5 +; CHECK: sqrtsd +define double @f5() { +entry: + %result = call double @llvm.experimental.constrained.sqrt.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that pow(42.1, 3.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f6 +; CHECK: pow +define double @f6() { +entry: + %result = call double @llvm.experimental.constrained.pow.f64(double 42.1, + double 3.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that powi(42.1, 3) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f7 +; CHECK: powi +define double @f7() { +entry: + %result = call double @llvm.experimental.constrained.powi.f64(double 42.1, + i32 3, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that sin(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f8 +; CHECK: sin +define double @f8() { +entry: + %result = call double @llvm.experimental.constrained.sin.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that cos(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f9 +; CHECK: cos +define double @f9() { +entry: + %result = call double @llvm.experimental.constrained.cos.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that exp(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f10 +; CHECK: exp +define double @f10() { +entry: + %result = call double @llvm.experimental.constrained.exp.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that exp2(42.1) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f11 +; CHECK: exp2 +define double @f11() { +entry: + %result = call double @llvm.experimental.constrained.exp2.f64(double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that log(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f12 +; CHECK: log +define double @f12() { +entry: + %result = call double @llvm.experimental.constrained.log.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that log10(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f13 +; CHECK: log10 +define double @f13() { +entry: + %result = call double @llvm.experimental.constrained.log10.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that log2(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f14 +; CHECK: log2 +define double @f14() { +entry: + %result = call double @llvm.experimental.constrained.log2.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that rint(42.1) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f15 +; CHECK: rint +define double @f15() { +entry: + %result = call double @llvm.experimental.constrained.rint.f64(double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that nearbyint(42.1) isn't simplified when the rounding mode is +; unknown. +; CHECK-LABEL: f16 +; CHECK: nearbyint +define double @f16() { +entry: + %result = call double @llvm.experimental.constrained.nearbyint.f64( + double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata) +declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.exp.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.exp2.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.log10.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.log2.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata) diff --git a/test/CodeGen/X86/hoist-invariant-load.ll b/test/CodeGen/X86/hoist-invariant-load.ll index 5ade5b470b54..e7929c9cecdc 100644 --- a/test/CodeGen/X86/hoist-invariant-load.ll +++ b/test/CodeGen/X86/hoist-invariant-load.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc -mcpu=haswell < %s -stats -O2 2>&1 | grep "4 machine-licm.*hoisted" +; RUN: llc -mcpu=haswell < %s -stats -O2 2>&1 | grep "4 machinelicm.*hoisted" ; For test: ; 2 invariant loads, 1 for OBJC_SELECTOR_REFERENCES_ ; and 1 for objc_msgSend from the GOT diff --git a/test/CodeGen/X86/misched-copy.ll b/test/CodeGen/X86/misched-copy.ll index 3e3729285d27..7abd157f147a 100644 --- a/test/CodeGen/X86/misched-copy.ll +++ b/test/CodeGen/X86/misched-copy.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -verify-machineinstrs -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s ; ; Test scheduling of copy instructions. ; diff --git a/test/CodeGen/X86/or-branch.ll b/test/CodeGen/X86/or-branch.ll index 4899a0fc7e88..71d7746642e9 100644 --- a/test/CodeGen/X86/or-branch.ll +++ b/test/CodeGen/X86/or-branch.ll @@ -1,16 +1,34 @@ -; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=0 | FileCheck %s --check-prefix=JUMP2 --check-prefix=CHECK -; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=1 | FileCheck %s --check-prefix=JUMP1 --check-prefix=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=0 | FileCheck %s --check-prefix=JUMP2 +; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=1 | FileCheck %s --check-prefix=JUMP1 define void @foo(i32 %X, i32 %Y, i32 %Z) nounwind { ; JUMP2-LABEL: foo: -; JUMP2-DAG: jl -; JUMP2-DAG: je +; JUMP2: # BB#0: # %entry +; JUMP2-NEXT: cmpl $5, {{[0-9]+}}(%esp) +; JUMP2-NEXT: jl .LBB0_3 +; JUMP2-NEXT: # BB#1: # %entry +; JUMP2-NEXT: movl {{[0-9]+}}(%esp), %eax +; JUMP2-NEXT: testl %eax, %eax +; JUMP2-NEXT: je .LBB0_3 +; JUMP2-NEXT: # BB#2: # %UnifiedReturnBlock +; JUMP2-NEXT: retl +; JUMP2-NEXT: .LBB0_3: # %cond_true +; JUMP2-NEXT: jmp bar # TAILCALL ; ; JUMP1-LABEL: foo: -; JUMP1-DAG: sete -; JUMP1-DAG: setl -; JUMP1: orb -; JUMP1: jne +; JUMP1: # BB#0: # %entry +; JUMP1-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; JUMP1-NEXT: sete %al +; JUMP1-NEXT: cmpl $5, {{[0-9]+}}(%esp) +; JUMP1-NEXT: setl %cl +; JUMP1-NEXT: orb %al, %cl +; JUMP1-NEXT: cmpb $1, %cl +; JUMP1-NEXT: jne .LBB0_1 +; JUMP1-NEXT: # BB#2: # %cond_true +; JUMP1-NEXT: jmp bar # TAILCALL +; JUMP1-NEXT: .LBB0_1: # %UnifiedReturnBlock +; JUMP1-NEXT: retl entry: %tmp1 = icmp eq i32 %X, 0 %tmp3 = icmp slt i32 %Y, 5 @@ -29,11 +47,33 @@ UnifiedReturnBlock: ; regardless of whether they are expensive or not. define void @unpredictable(i32 %X, i32 %Y, i32 %Z) nounwind { -; CHECK-LABEL: unpredictable: -; CHECK-DAG: sete -; CHECK-DAG: setl -; CHECK: orb -; CHECK: jne +; JUMP2-LABEL: unpredictable: +; JUMP2: # BB#0: # %entry +; JUMP2-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; JUMP2-NEXT: sete %al +; JUMP2-NEXT: cmpl $5, {{[0-9]+}}(%esp) +; JUMP2-NEXT: setl %cl +; JUMP2-NEXT: orb %al, %cl +; JUMP2-NEXT: cmpb $1, %cl +; JUMP2-NEXT: jne .LBB1_1 +; JUMP2-NEXT: # BB#2: # %cond_true +; JUMP2-NEXT: jmp bar # TAILCALL +; JUMP2-NEXT: .LBB1_1: # %UnifiedReturnBlock +; JUMP2-NEXT: retl +; +; JUMP1-LABEL: unpredictable: +; JUMP1: # BB#0: # %entry +; JUMP1-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; JUMP1-NEXT: sete %al +; JUMP1-NEXT: cmpl $5, {{[0-9]+}}(%esp) +; JUMP1-NEXT: setl %cl +; JUMP1-NEXT: orb %al, %cl +; JUMP1-NEXT: cmpb $1, %cl +; JUMP1-NEXT: jne .LBB1_1 +; JUMP1-NEXT: # BB#2: # %cond_true +; JUMP1-NEXT: jmp bar # TAILCALL +; JUMP1-NEXT: .LBB1_1: # %UnifiedReturnBlock +; JUMP1-NEXT: retl entry: %tmp1 = icmp eq i32 %X, 0 %tmp3 = icmp slt i32 %Y, 5 diff --git a/test/CodeGen/X86/pr27681.mir b/test/CodeGen/X86/pr27681.mir index 002761bc1e68..956df172b253 100644 --- a/test/CodeGen/X86/pr27681.mir +++ b/test/CodeGen/X86/pr27681.mir @@ -57,7 +57,7 @@ body: | %cl = SETNEr implicit %eflags ; Verify that removal of the %bl antidependence does not use %ch ; as a replacement register. - ; CHECK: %cl = AND8rr %cl, killed %b + ; CHECK: %cl = AND8rr killed %cl, killed %b %cl = AND8rr killed %cl, killed %bl, implicit-def dead %eflags CMP32ri8 %ebp, -1, implicit-def %eflags %edx = MOV32ri 0 diff --git a/test/CodeGen/X86/sandybridge-loads.ll b/test/CodeGen/X86/sandybridge-loads.ll index 2e31154068fc..8570fe7fe7ba 100644 --- a/test/CodeGen/X86/sandybridge-loads.ll +++ b/test/CodeGen/X86/sandybridge-loads.ll @@ -1,13 +1,20 @@ -; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s - -;CHECK-LABEL: wideloads: -;CHECK: vmovaps -;CHECK: vinsertf128 -;CHECK: vmovaps -;CHECK-NOT: vinsertf128 -;CHECK: ret +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { +; CHECK-LABEL: wideloads: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 +; CHECK-NEXT: vmovaps (%rsi), %ymm1 +; CHECK-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 +; CHECK-NEXT: vmovaps (%rdx), %ymm2 +; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vmovaps %ymm0, (%rax) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %v0 = load <8 x float>, <8 x float>* %a, align 16 ; <---- unaligned! %v1 = load <8 x float>, <8 x float>* %b, align 32 ; <---- aligned! %m0 = fcmp olt <8 x float> %v1, %v0 @@ -19,17 +26,16 @@ define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi ret void } -; CHECK: widestores -; loads: -; CHECK: vmovaps -; CHECK: vmovaps -; stores: -; CHECK: vmovaps -; CHECK: vextractf128 -; CHECK: vmovaps -;CHECK: ret - define void @widestores(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { +; CHECK-LABEL: widestores: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovaps (%rsi), %ymm1 +; CHECK-NEXT: vmovaps %ymm0, (%rsi) +; CHECK-NEXT: vextractf128 $1, %ymm1, 16(%rdi) +; CHECK-NEXT: vmovaps %xmm1, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %v0 = load <8 x float>, <8 x float>* %a, align 32 %v1 = load <8 x float>, <8 x float>* %b, align 32 store <8 x float> %v0, <8 x float>* %b, align 32 ; <--- aligned diff --git a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll index 383ab21bd404..19305d0dad62 100644 --- a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll @@ -354,9 +354,8 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea define i32 @test_mm_crc32_u8(i32 %a0, i8 %a1) { ; X32-LABEL: test_mm_crc32_u8: ; X32: # BB#0: -; X32-NEXT: movb {{[0-9]+}}(%esp), %cl ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: crc32b %cl, %eax +; X32-NEXT: crc32b {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_crc32_u8: @@ -372,9 +371,8 @@ declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind readnone define i32 @test_mm_crc32_u16(i32 %a0, i16 %a1) { ; X32-LABEL: test_mm_crc32_u16: ; X32: # BB#0: -; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: crc32w %cx, %eax +; X32-NEXT: crc32w {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_crc32_u16: diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll index 72542f499087..a00d47bb13e9 100644 --- a/test/CodeGen/X86/stack-folding-fp-avx1.ll +++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll @@ -1651,26 +1651,9 @@ define <8 x float> @stack_fold_sqrtps_ymm(<8 x float> %a0) { } declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone -define double @stack_fold_sqrtsd(double %a0) { - ;CHECK-LABEL: stack_fold_sqrtsd - ;CHECK: vsqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call double @llvm.sqrt.f64(double %a0) - ret double %2 -} -declare double @llvm.sqrt.f64(double) nounwind readnone - +; TODO stack_fold_sqrtsd ; TODO stack_fold_sqrtsd_int - -define float @stack_fold_sqrtss(float %a0) { - ;CHECK-LABEL: stack_fold_sqrtss - ;CHECK: vsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call float @llvm.sqrt.f32(float %a0) - ret float %2 -} -declare float @llvm.sqrt.f32(float) nounwind readnone - +; TODO stack_fold_sqrtss ; TODO stack_fold_sqrtss_int define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) { diff --git a/test/CodeGen/X86/twoaddr-coalesce-2.ll b/test/CodeGen/X86/twoaddr-coalesce-2.ll index cbcde0655597..9da071f7ede6 100644 --- a/test/CodeGen/X86/twoaddr-coalesce-2.ll +++ b/test/CodeGen/X86/twoaddr-coalesce-2.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -stats 2>&1 | \ -; RUN: grep "twoaddrinstr" | grep "Number of instructions aggressively commuted" +; RUN: grep "twoaddressinstruction" | grep "Number of instructions aggressively commuted" ; rdar://6480363 target triple = "i386-apple-darwin9.6" diff --git a/test/CodeGen/X86/vector-narrow-binop.ll b/test/CodeGen/X86/vector-narrow-binop.ll index f737ea2b7fba..4d183f3172b3 100644 --- a/test/CodeGen/X86/vector-narrow-binop.ll +++ b/test/CodeGen/X86/vector-narrow-binop.ll @@ -22,17 +22,17 @@ define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d ; ; AVX1-LABEL: PR32790: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm1 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR32790: @@ -60,46 +60,17 @@ define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; SSE-LABEL: do_not_use_256bit_op: ; SSE: # BB#0: -; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: do_not_use_256bit_op: -; AVX1: # BB#0: -; AVX1-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def> -; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: do_not_use_256bit_op: -; AVX2: # BB#0: -; AVX2-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def> -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: do_not_use_256bit_op: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def> -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: do_not_use_256bit_op: +; AVX: # BB#0: +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %concat1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %concat2 = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %and = and <8 x i32> %concat1, %concat2 diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll index 27909c6bb4a0..adda108bdc77 100644 --- a/test/CodeGen/X86/vector-popcnt-128.ll +++ b/test/CodeGen/X86/vector-popcnt-128.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-LABEL: testv2i64: @@ -81,19 +82,41 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv2i64: -; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv2i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in) ret <2 x i64> %out } @@ -193,23 +216,49 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE41-NEXT: packuswb %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv4i32: -; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv4i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in) ret <4 x i32> %out } diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll index 7a675619d720..accbad35e9d7 100644 --- a/test/CodeGen/X86/vector-popcnt-256.ll +++ b/test/CodeGen/X86/vector-popcnt-256.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64: @@ -39,6 +40,13 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv4i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512VPOPCNTDQ-NEXT: retq %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in) ret <4 x i64> %out } @@ -92,6 +100,13 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in) ret <8 x i32> %out } @@ -137,6 +152,21 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in) ret <16 x i16> %out } @@ -173,6 +203,18 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in) ret <32 x i8> %out } diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll index cf4f21e62b61..aa50206e7a5e 100644 --- a/test/CodeGen/X86/vector-popcnt-512.ll +++ b/test/CodeGen/X86/vector-popcnt-512.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512F-LABEL: testv8i64: @@ -39,6 +40,11 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i64: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %in) ret <8 x i64> %out } @@ -92,6 +98,11 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i32: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in) ret <16 x i32> %out } @@ -135,6 +146,30 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i16: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in) ret <32 x i16> %out } @@ -169,6 +204,24 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv64i8: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in) ret <64 x i8> %out } diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll index fa3471c2fe40..2e65bd8c75c7 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -282,8 +282,7 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) { ; ALL-LABEL: shuffle_v16f32_extract_256: ; ALL: # BB#0: -; ALL-NEXT: vmovups (%rsi), %zmm0 -; ALL-NEXT: vextractf32x8 $1, %zmm0, %ymm0 +; ALL-NEXT: vmovups 32(%rsi), %ymm0 ; ALL-NEXT: retq %ptr_a = bitcast float* %a to <16 x float>* %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4 diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll index 5aab21749d14..706edd27a3f1 100644 --- a/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -511,11 +511,10 @@ define <8 x float> @expand14(<4 x float> %a) { ; ; KNL64-LABEL: expand14: ; KNL64: # BB#0: +; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0] +; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0] -; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL64-NEXT: retq ; @@ -529,11 +528,10 @@ define <8 x float> @expand14(<4 x float> %a) { ; ; KNL32-LABEL: expand14: ; KNL32: # BB#0: +; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0] +; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0] -; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL32-NEXT: retl %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0> @@ -545,39 +543,35 @@ define <8 x float> @expand14(<4 x float> %a) { define <8 x float> @expand15(<4 x float> %a) { ; SKX64-LABEL: expand15: ; SKX64: # BB#0: -; SKX64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u> -; SKX64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0] +; SKX64-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0] +; SKX64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3] -; SKX64-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0 +; SKX64-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 ; SKX64-NEXT: retq ; ; KNL64-LABEL: expand15: ; KNL64: # BB#0: +; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0] +; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0] -; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL64-NEXT: retq ; ; SKX32-LABEL: expand15: ; SKX32: # BB#0: -; SKX32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u> -; SKX32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0] +; SKX32-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0] +; SKX32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3] -; SKX32-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0 +; SKX32-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 ; SKX32-NEXT: retl ; ; KNL32-LABEL: expand15: ; KNL32: # BB#0: +; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0] +; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0] -; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL32-NEXT: retl %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0> diff --git a/test/CodeGen/X86/vector-sqrt.ll b/test/CodeGen/X86/vector-sqrt.ll index c5ac4466b5fa..13088b7fa5f2 100644 --- a/test/CodeGen/X86/vector-sqrt.ll +++ b/test/CodeGen/X86/vector-sqrt.ll @@ -5,8 +5,10 @@ define <2 x double> @sqrtd2(double* nocapture readonly %v) local_unnamed_addr #0 { ; CHECK-LABEL: sqrtd2: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: vsqrtsd 8(%rdi), %xmm1, %xmm1 +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq entry: @@ -27,10 +29,14 @@ declare double @sqrt(double) local_unnamed_addr #1 define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 { ; CHECK-LABEL: sqrtf4: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: vsqrtss 4(%rdi), %xmm1, %xmm1 -; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm2 -; CHECK-NEXT: vsqrtss 12(%rdi), %xmm3, %xmm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vsqrtss %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vsqrtss %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll index a5fac9ac6a41..d4fbb72bbe6d 100644 --- a/test/CodeGen/X86/vector-trunc-math.ll +++ b/test/CodeGen/X86/vector-trunc-math.ll @@ -3030,10 +3030,10 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; SSE-LABEL: trunc_and_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 @@ -3786,10 +3786,10 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; SSE-LABEL: trunc_xor_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm3, %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 @@ -4542,10 +4542,10 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; SSE-LABEL: trunc_or_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 22d0065b264f..a22a60756264 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -7,6 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ ; ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt. ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41 @@ -117,6 +118,17 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv2i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -284,6 +296,17 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv2i64u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv2i64u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -501,6 +524,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX512CD-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -700,6 +735,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i32u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv4i32u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -843,6 +890,25 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -984,6 +1050,25 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i16u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv8i16u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -1106,6 +1191,22 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -1224,6 +1325,22 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i8u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv16i8u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -1258,6 +1375,12 @@ define <2 x i64> @foldv2i64() nounwind { ; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv2i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: movl $8, %eax +; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movl $8, %eax @@ -1280,6 +1403,12 @@ define <2 x i64> @foldv2i64u() nounwind { ; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv2i64u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: movl $8, %eax +; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv2i64u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movl $8, %eax @@ -1300,6 +1429,11 @@ define <4 x i32> @foldv4i32() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv4i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] @@ -1319,6 +1453,11 @@ define <4 x i32> @foldv4i32u() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv4i32u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv4i32u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] @@ -1338,6 +1477,11 @@ define <8 x i16> @foldv8i16() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv8i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] @@ -1357,6 +1501,11 @@ define <8 x i16> @foldv8i16u() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv8i16u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv8i16u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] @@ -1376,6 +1525,11 @@ define <16 x i8> @foldv16i8() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv16i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] @@ -1395,6 +1549,11 @@ define <16 x i8> @foldv16i8u() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv16i8u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv16i8u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index a0b277ddd732..101ae95550e7 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ ; ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt. ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 @@ -12,11 +13,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -28,6 +26,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -92,6 +92,17 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv4i64: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -117,11 +128,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -133,6 +141,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -182,6 +192,17 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i64u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv4i64u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -205,28 +226,27 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-LABEL: testv8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1] -; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -234,12 +254,12 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv8i32: @@ -307,6 +327,17 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv8i32: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -335,28 +366,27 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX1-LABEL: testv8i32u: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1] -; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -364,12 +394,12 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv8i32u: @@ -414,6 +444,17 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i32u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv8i32u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -442,32 +483,31 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm5 +; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -532,6 +572,25 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv16i16: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -557,32 +616,31 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16u: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm5 +; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -647,6 +705,25 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i16u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv16i16u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -674,27 +751,26 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -747,6 +823,22 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv32i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv32i8: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -771,27 +863,26 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -844,6 +935,22 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv32i8u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv32i8u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll index 2d1715949a5e..abbe964e983c 100644 --- a/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/test/CodeGen/X86/vector-tzcnt-512.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=-avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512CD-LABEL: testv8i64: @@ -64,6 +65,15 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i64: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0) ret <8 x i64> %out } @@ -105,6 +115,15 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i64u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1) ret <8 x i64> %out } @@ -186,6 +205,15 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i32: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 0) ret <16 x i32> %out } @@ -231,6 +259,15 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i32u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 -1) ret <16 x i32> %out } @@ -305,6 +342,38 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i16: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0) ret <32 x i16> %out } @@ -379,6 +448,38 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i16u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1) ret <32 x i16> %out } @@ -441,6 +542,32 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv64i8: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0) ret <64 x i8> %out } @@ -503,6 +630,32 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv64i8u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1) ret <64 x i8> %out } diff --git a/test/CodeGen/X86/wide-integer-cmp.ll b/test/CodeGen/X86/wide-integer-cmp.ll index fbaf500e8333..b5c7f86567a1 100644 --- a/test/CodeGen/X86/wide-integer-cmp.ll +++ b/test/CodeGen/X86/wide-integer-cmp.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=i686-linux-gnu %s -o - | FileCheck %s - define i32 @branch_eq(i64 %a, i64 %b) { ; CHECK-LABEL: branch_eq: ; CHECK: # BB#0: # %entry diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll index 6b2e4de5cdaa..42c4c23c6349 100644 --- a/test/CodeGen/X86/widened-broadcast.ll +++ b/test/CodeGen/X86/widened-broadcast.ll @@ -151,8 +151,7 @@ define <8 x i32> @load_splat_8i32_8i32_01010101(<8 x i32>* %ptr) nounwind uwtabl ; ; AVX1-LABEL: load_splat_8i32_8i32_01010101: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovapd (%rdi), %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -288,8 +287,7 @@ define <16 x i16> @load_splat_16i16_16i16_0101010101010101(<16 x i16>* %ptr) nou ; ; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -315,22 +313,10 @@ define <16 x i16> @load_splat_16i16_16i16_0123012301230123(<16 x i16>* %ptr) nou ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX1-LABEL: load_splat_16i16_16i16_0123012301230123: -; AVX1: # BB#0: # %entry -; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_splat_16i16_16i16_0123012301230123: -; AVX2: # BB#0: # %entry -; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_splat_16i16_16i16_0123012301230123: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovaps (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: load_splat_16i16_16i16_0123012301230123: +; AVX: # BB#0: # %entry +; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX-NEXT: retq entry: %ld = load <16 x i16>, <16 x i16>* %ptr %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3,i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> @@ -513,8 +499,7 @@ define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(<32 x i8 ; ; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -587,26 +572,10 @@ define <4 x float> @load_splat_4f32_8f32_0000(<8 x float>* %ptr) nounwind uwtabl ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; -; AVX1-LABEL: load_splat_4f32_8f32_0000: -; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_splat_4f32_8f32_0000: -; AVX2: # BB#0: # %entry -; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_splat_4f32_8f32_0000: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovaps (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: load_splat_4f32_8f32_0000: +; AVX: # BB#0: # %entry +; AVX-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX-NEXT: retq entry: %ld = load <8 x float>, <8 x float>* %ptr %ret = shufflevector <8 x float> %ld, <8 x float> undef, <4 x i32> zeroinitializer @@ -627,22 +596,10 @@ define <8 x float> @load_splat_8f32_16f32_89898989(<16 x float>* %ptr) nounwind ; SSE42-NEXT: movapd %xmm0, %xmm1 ; SSE42-NEXT: retq ; -; AVX1-LABEL: load_splat_8f32_16f32_89898989: -; AVX1: # BB#0: # %entry -; AVX1-NEXT: vbroadcastsd 32(%rdi), %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_splat_8f32_16f32_89898989: -; AVX2: # BB#0: # %entry -; AVX2-NEXT: vbroadcastsd 32(%rdi), %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_splat_8f32_16f32_89898989: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovapd (%rdi), %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: load_splat_8f32_16f32_89898989: +; AVX: # BB#0: # %entry +; AVX-NEXT: vbroadcastsd 32(%rdi), %ymm0 +; AVX-NEXT: retq entry: %ld = load <16 x float>, <16 x float>* %ptr %ret = shufflevector <16 x float> %ld, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 8, i32 9, i32 8, i32 9, i32 8, i32 9> diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll index 6fbec91e77a3..f4d0503f4a79 100644 --- a/test/CodeGen/X86/x86-interleaved-access.ll +++ b/test/CodeGen/X86/x86-interleaved-access.ll @@ -57,10 +57,8 @@ define <4 x double> @load_factorf64_1(<16 x double>* %ptr) { ; AVX1: # BB#0: ; AVX1-NEXT: vmovups (%rdi), %ymm0 ; AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; AVX1-NEXT: vmovups 64(%rdi), %ymm2 -; AVX1-NEXT: vmovups 96(%rdi), %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -69,10 +67,8 @@ define <4 x double> @load_factorf64_1(<16 x double>* %ptr) { ; AVX2: # BB#0: ; AVX2-NEXT: vmovupd (%rdi), %ymm0 ; AVX2-NEXT: vmovupd 32(%rdi), %ymm1 -; AVX2-NEXT: vmovupd 64(%rdi), %ymm2 -; AVX2-NEXT: vmovupd 96(%rdi), %ymm3 -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: vmulpd %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/x87.ll b/test/CodeGen/X86/x87.ll index 683d7b05cf8c..9bc654861b69 100644 --- a/test/CodeGen/X86/x87.ll +++ b/test/CodeGen/X86/x87.ll @@ -1,13 +1,16 @@ ; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X87 ; RUN: llc < %s -march=x86-64 -mattr=-sse | FileCheck %s -check-prefix=X87 -; RUN: llc < %s -march=x86 -mattr=-x87 | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}" -; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}" -; RUN: llc < %s -march=x86 -mattr=-x87,+sse | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}" -; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse2 | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}" +; RUN: llc < %s -march=x86 -mattr=-x87 | FileCheck %s -check-prefix=NOX87 +; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse | FileCheck %s -check-prefix=NOX87 +; RUN: llc < %s -march=x86 -mattr=-x87,+sse | FileCheck %s -check-prefix=NOX87 +; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse2 | FileCheck %s -check-prefix=NOX87 define void @test(i32 %i, i64 %l, float* %pf, double* %pd, fp128* %pld) nounwind readnone { ; X87-LABEL: test: ; NOX87-LABEL: test: + +; NOX87-NOT: {{ }}f{{.*}} + ; X87: fild ; NOX87: __floatunsisf %tmp = uitofp i32 %i to float |