author      Dimitry Andric <dim@FreeBSD.org>  2017-01-14 15:37:50 +0000
committer   Dimitry Andric <dim@FreeBSD.org>  2017-01-14 15:37:50 +0000
commit      581a6d8501ff5614297da837b81ed3b6956361ea (patch)
tree        985ee91d0ca1d3e6506ac5ff7e37f5b67adfec09 /test/CodeGen
parent      909545a822eef491158f831688066f0ec2866938 (diff)
download    src-581a6d8501ff5614297da837b81ed3b6956361ea.tar.gz
            src-581a6d8501ff5614297da837b81ed3b6956361ea.zip

Vendor import of llvm release_40 branch r292009: vendor/llvm/llvm-release_40-r292009

Notes:
    svn path=/vendor/llvm/dist/; revision=312173
    svn path=/vendor/llvm/llvm-release_40-r292009/; revision=312174; tag=vendor/llvm/llvm-release_40-r292009
Diffstat (limited to 'test/CodeGen')
-rw-r--r--  test/CodeGen/AArch64/arm64-neon-copy.ll  9
-rw-r--r--  test/CodeGen/AArch64/arm64-nvcast.ll  18
-rw-r--r--  test/CodeGen/AArch64/bitreverse.ll  35
-rw-r--r--  test/CodeGen/AArch64/rbit.ll  22
-rw-r--r--  test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir  858
-rw-r--r--  test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll  262
-rw-r--r--  test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll  12
-rw-r--r--  test/CodeGen/AMDGPU/fneg-combines.ll  1282
-rw-r--r--  test/CodeGen/AMDGPU/fp16_to_fp.ll  29
-rw-r--r--  test/CodeGen/AMDGPU/fp16_to_fp32.ll  22
-rw-r--r--  test/CodeGen/AMDGPU/fp16_to_fp64.ll  16
-rw-r--r--  test/CodeGen/AMDGPU/fp32_to_fp16.ll  17
-rw-r--r--  test/CodeGen/AMDGPU/insert_vector_elt.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/local-stack-slot-bug.ll  7
-rw-r--r--  test/CodeGen/AMDGPU/mad-combine.ll  16
-rw-r--r--  test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll  46
-rw-r--r--  test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll  840
-rw-r--r--  test/CodeGen/AMDGPU/select-opt.ll  161
-rw-r--r--  test/CodeGen/AMDGPU/sext-in-reg.ll  133
-rw-r--r--  test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir  597
-rw-r--r--  test/CodeGen/AMDGPU/v_mac.ll  66
-rw-r--r--  test/CodeGen/ARM/fp16-promote.ll  4
-rw-r--r--  test/CodeGen/ARM/fpcmp_ueq.ll  6
-rw-r--r--  test/CodeGen/ARM/vdup.ll  3
-rw-r--r--  test/CodeGen/ARM/vpadd.ll  248
-rw-r--r--  test/CodeGen/ARM/vtrn.ll  4
-rw-r--r--  test/CodeGen/Mips/llvm-ir/extractelement.ll  3
-rw-r--r--  test/CodeGen/Mips/msa/immediates-bad.ll  1681
-rw-r--r--  test/CodeGen/Mips/msa/immediates.ll  1276
-rw-r--r--  test/CodeGen/Mips/msa/msa-nooddspreg.ll  55
-rw-r--r--  test/CodeGen/NVPTX/fast-math.ll  17
-rw-r--r--  test/CodeGen/PowerPC/change-no-infs.ll  67
-rw-r--r--  test/CodeGen/PowerPC/variable_elem_vec_extracts.ll  6
-rw-r--r--  test/CodeGen/WebAssembly/function-bitcasts.ll  16
-rw-r--r--  test/CodeGen/X86/atom-bypass-slow-division-64.ll  51
-rw-r--r--  test/CodeGen/X86/atom-bypass-slow-division.ll  112
-rw-r--r--  test/CodeGen/X86/atomic-eflags-reuse.ll  80
-rw-r--r--  test/CodeGen/X86/avx-cvt.ll  22
-rwxr-xr-x  test/CodeGen/X86/avx-trunc.ll  26
-rw-r--r--  test/CodeGen/X86/avx512-cvt.ll  1388
-rw-r--r--  test/CodeGen/X86/avx512-select.ll  19
-rw-r--r--  test/CodeGen/X86/avx512-trunc.ll  110
-rw-r--r--  test/CodeGen/X86/bypass-slow-division-32.ll  240
-rw-r--r--  test/CodeGen/X86/bypass-slow-division-64.ll  78
-rw-r--r--  test/CodeGen/X86/bypass-slow-division-tune.ll  55
-rw-r--r--  test/CodeGen/X86/change-unsafe-fp-math.ll  56
-rw-r--r--  test/CodeGen/X86/cmp.ll  52
-rw-r--r--  test/CodeGen/X86/cpus.ll  1
-rw-r--r--  test/CodeGen/X86/extractelement-index.ll  16
-rw-r--r--  test/CodeGen/X86/extractelement-legalization-store-ordering.ll  10
-rw-r--r--  test/CodeGen/X86/i64-mem-copy.ll  3
-rw-r--r--  test/CodeGen/X86/implicit-null-checks.mir  2
-rw-r--r--  test/CodeGen/X86/lzcnt-zext-cmp.ll  2
-rw-r--r--  test/CodeGen/X86/peephole.mir  40
-rw-r--r--  test/CodeGen/X86/slow-div.ll  43
-rw-r--r--  test/CodeGen/X86/slow-unaligned-mem.ll  1
-rw-r--r--  test/CodeGen/X86/sse2-intrinsics-fast-isel.ll  13
-rw-r--r--  test/CodeGen/X86/vec_ins_extract-1.ll  24
-rw-r--r--  test/CodeGen/X86/vec_insert-4.ll  6
-rw-r--r--  test/CodeGen/X86/vec_insert-8.ll  18
-rw-r--r--  test/CodeGen/X86/vec_int_to_fp.ll  60
-rw-r--r--  test/CodeGen/X86/vector-shift-ashr-128.ll  113
-rw-r--r--  test/CodeGen/X86/vector-shift-ashr-256.ll  199
-rw-r--r--  test/CodeGen/X86/vector-shift-ashr-512.ll  1123
-rw-r--r--  test/CodeGen/X86/vector-shift-lshr-128.ll  109
-rw-r--r--  test/CodeGen/X86/vector-shift-lshr-256.ll  160
-rw-r--r--  test/CodeGen/X86/vector-shift-lshr-512.ll  1077
-rw-r--r--  test/CodeGen/X86/vector-shift-shl-128.ll  107
-rw-r--r--  test/CodeGen/X86/vector-shift-shl-256.ll  154
-rw-r--r--  test/CodeGen/X86/vector-shift-shl-512.ll  1071
-rw-r--r--  test/CodeGen/X86/vector-shuffle-avx512.ll  333
-rw-r--r--  test/CodeGen/X86/vector-shuffle-combining-xop.ll  14
-rw-r--r--  test/CodeGen/X86/vector-shuffle-variable-128.ll  1184
-rw-r--r--  test/CodeGen/X86/vector-shuffle-variable-256.ll  334
-rw-r--r--  test/CodeGen/X86/x86-64-double-shifts-var.ll  1
75 files changed, 11674 insertions, 4675 deletions
diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index e91a1a42c233..8d9a8c06aa3c 100644
--- a/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -904,8 +904,9 @@ define <8 x i8> @getl(<16 x i8> %x) #0 {
; CHECK-LABEL: test_extracts_inserts_varidx_extract:
; CHECK: str q0
-; CHECK: add x[[PTR:[0-9]+]], {{.*}}, w0, sxtw #1
-; CHECK-DAG: ld1 { v[[R:[0-9]+]].h }[0], [x[[PTR]]]
+; CHECK-DAG: and [[MASKED_IDX:x[0-9]+]], x0, #0x7
+; CHECK: bfi [[PTR:x[0-9]+]], [[MASKED_IDX]], #1, #3
+; CHECK-DAG: ld1 { v[[R:[0-9]+]].h }[0], {{\[}}[[PTR]]{{\]}}
; CHECK-DAG: ins v[[R]].h[1], v0.h[1]
; CHECK-DAG: ins v[[R]].h[2], v0.h[2]
; CHECK-DAG: ins v[[R]].h[3], v0.h[3]
@@ -922,7 +923,9 @@ define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) {
}
; CHECK-LABEL: test_extracts_inserts_varidx_insert:
-; CHECK: str h0, [{{.*}}, w0, sxtw #1]
+; CHECK: and [[MASKED_IDX:x[0-9]+]], x0, #0x3
+; CHECK: bfi x9, [[MASKED_IDX]], #1, #2
+; CHECK: st1 { v0.h }[0], [x9]
; CHECK-DAG: ldr d[[R:[0-9]+]]
; CHECK-DAG: ins v[[R]].h[1], v0.h[1]
; CHECK-DAG: ins v[[R]].h[2], v0.h[2]
diff --git a/test/CodeGen/AArch64/arm64-nvcast.ll b/test/CodeGen/AArch64/arm64-nvcast.ll
index c3a1640ab012..ba2512718c4e 100644
--- a/test/CodeGen/AArch64/arm64-nvcast.ll
+++ b/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -1,10 +1,12 @@
; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
; CHECK-LABEL: _test:
-; CHECK: fmov.2d v0, #2.00000000
-; CHECK: str q0, [sp, #-16]!
-; CHECK: mov x8, sp
-; CHECK: ldr s0, [x8, w1, sxtw #2]
+; CHECK-DAG: fmov.2d v0, #2.00000000
+; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
+; CHECK-DAG: mov x9, sp
+; CHECK-DAG: str q0, [sp], #16
+; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
+; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
; CHECK: str s0, [x0]
define void @test(float * %p1, i32 %v1) {
@@ -16,9 +18,11 @@ entry:
; CHECK-LABEL: _test2
; CHECK: movi.16b v0, #63
-; CHECK: str q0, [sp, #-16]!
-; CHECK: mov x8, sp
-; CHECK: ldr s0, [x8, w1, sxtw #2]
+; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
+; CHECK-DAG: str q0, [sp], #16
+; CHECK-DAG: mov x9, sp
+; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
+; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
; CHECK: str s0, [x0]
define void @test2(float * %p1, i32 %v1) {
diff --git a/test/CodeGen/AArch64/bitreverse.ll b/test/CodeGen/AArch64/bitreverse.ll
index 135bce3bdb6c..85496ab03214 100644
--- a/test/CodeGen/AArch64/bitreverse.ll
+++ b/test/CodeGen/AArch64/bitreverse.ll
@@ -1,14 +1,18 @@
; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s
-; These tests just check that the plumbing is in place for @llvm.bitreverse. The
-; actual output is massive at the moment as llvm.bitreverse is not yet legal.
+; These tests just check that the plumbing is in place for @llvm.bitreverse.
declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone
define <2 x i16> @f(<2 x i16> %a) {
; CHECK-LABEL: f:
-; CHECK: rev32
-; CHECK: ushr
+; CHECK: fmov [[REG1:w[0-9]+]], s0
+; CHECK-DAG: rbit [[REG2:w[0-9]+]], [[REG1]]
+; CHECK-DAG: fmov s0, [[REG2]]
+; CHECK-DAG: mov [[REG3:w[0-9]+]], v0.s[1]
+; CHECK-DAG: rbit [[REG4:w[0-9]+]], [[REG3]]
+; CHECK-DAG: ins v0.s[1], [[REG4]]
+; CHECK-DAG: ushr v0.2s, v0.2s, #16
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
ret <2 x i16> %b
}
@@ -17,26 +21,9 @@ declare i8 @llvm.bitreverse.i8(i8) readnone
define i8 @g(i8 %a) {
; CHECK-LABEL: g:
-; CHECK-DAG: rev [[RV:w.*]], w0
-; CHECK-DAG: and [[L4:w.*]], [[RV]], #0xf0f0f0f
-; CHECK-DAG: and [[H4:w.*]], [[RV]], #0xf0f0f0f0
-; CHECK-DAG: lsr [[S4:w.*]], [[H4]], #4
-; CHECK-DAG: orr [[R4:w.*]], [[S4]], [[L4]], lsl #4
-
-; CHECK-DAG: and [[L2:w.*]], [[R4]], #0x33333333
-; CHECK-DAG: and [[H2:w.*]], [[R4]], #0xcccccccc
-; CHECK-DAG: lsr [[S2:w.*]], [[H2]], #2
-; CHECK-DAG: orr [[R2:w.*]], [[S2]], [[L2]], lsl #2
-
-; CHECK-DAG: mov [[P1:w.*]], #1426063360
-; CHECK-DAG: mov [[N1:w.*]], #-1442840576
-; CHECK-DAG: and [[L1:w.*]], [[R2]], [[P1]]
-; CHECK-DAG: and [[H1:w.*]], [[R2]], [[N1]]
-; CHECK-DAG: lsr [[S1:w.*]], [[H1]], #1
-; CHECK-DAG: orr [[R1:w.*]], [[S1]], [[L1]], lsl #1
-
-; CHECK-DAG: lsr w0, [[R1]], #24
-; CHECK-DAG: ret
+; CHECK: rbit [[REG:w[0-9]+]], w0
+; CHECK-NEXT: lsr w0, [[REG]], #24
+; CHECK-NEXT: ret
%b = call i8 @llvm.bitreverse.i8(i8 %a)
ret i8 %b
}
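
; Note on the i8 checks above: rbit reverses the full 32-bit register, so the
; reversed byte lands in bits [31:24] and the lsr #24 brings it back down.
; The following is only an illustrative sketch of that same equivalence in
; plain IR (the function name is made up and is not part of the patch):
;
; define i8 @bitreverse_i8_sketch(i8 %a) {
;   %ext = zext i8 %a to i32                            ; byte in bits [7:0]
;   %rev = call i32 @llvm.bitreverse.i32(i32 %ext)      ; reversed byte now in bits [31:24]
;   %shr = lshr i32 %rev, 24                            ; shift it back to bits [7:0]
;   %res = trunc i32 %shr to i8
;   ret i8 %res
; }
;
; declare i32 @llvm.bitreverse.i32(i32)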
diff --git a/test/CodeGen/AArch64/rbit.ll b/test/CodeGen/AArch64/rbit.ll
index 3404ae4b6bee..288a25bd65e3 100644
--- a/test/CodeGen/AArch64/rbit.ll
+++ b/test/CodeGen/AArch64/rbit.ll
@@ -1,5 +1,8 @@
; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s
+; The llvm.aarch64.rbit intrinsic should be auto-upgraded to the
+; target-independent bitreverse intrinsic.
+
; CHECK-LABEL: rbit32
; CHECK: rbit w0, w0
define i32 @rbit32(i32 %t) {
@@ -18,3 +21,22 @@ entry:
declare i64 @llvm.aarch64.rbit.i64(i64)
declare i32 @llvm.aarch64.rbit.i32(i32)
+
+; CHECK-LABEL: rbit_generic32
+; CHECK: rbit w0, w0
+define i32 @rbit_generic32(i32 %t) {
+entry:
+ %rbit = call i32 @llvm.bitreverse.i32(i32 %t)
+ ret i32 %rbit
+}
+
+; CHECK-LABEL: rbit_generic64
+; CHECK: rbit x0, x0
+define i64 @rbit_generic64(i64 %t) {
+entry:
+ %rbit = call i64 @llvm.bitreverse.i64(i64 %t)
+ ret i64 %rbit
+}
+
+declare i32 @llvm.bitreverse.i32(i32) readnone
+declare i64 @llvm.bitreverse.i64(i64) readnone
diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
new file mode 100644
index 000000000000..34bb2588ad62
--- /dev/null
+++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -0,0 +1,858 @@
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s
+--- |
+ define void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %and = and i32 %a, 1234567
+ store volatile i32 %and, i32 addrspace(1)* %out
+ ret void
+ }
+
+ define void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
+ %a = load i32, i32 addrspace(1)* %gep.a
+ %and = and i32 %a, 1234567
+ store i32 %and, i32 addrspace(1)* %gep.out
+ ret void
+ }
+
+ define void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %shl = shl i32 %a, 12
+ store volatile i32 %shl, i32 addrspace(1)* %out
+ ret void
+ }
+
+ define void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
+ %a = load i32, i32 addrspace(1)* %gep.a
+ %shl = shl i32 %a, 12
+ store i32 %shl, i32 addrspace(1)* %gep.out
+ ret void
+ }
+
+ define void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %ashr = ashr i32 %a, 12
+ store volatile i32 %ashr, i32 addrspace(1)* %out
+ ret void
+ }
+
+ define void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
+ %a = load i32, i32 addrspace(1)* %gep.a
+ %ashr = ashr i32 %a, 12
+ store i32 %ashr, i32 addrspace(1)* %gep.out
+ ret void
+ }
+
+ define void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %lshr = lshr i32 %a, 12
+ store volatile i32 %lshr, i32 addrspace(1)* %out
+ ret void
+ }
+
+ define void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
+ %a = load i32, i32 addrspace(1)* %gep.a
+ %lshr = lshr i32 %a, 12
+ store i32 %lshr, i32 addrspace(1)* %gep.out
+ ret void
+ }
+
+ declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+ attributes #0 = { nounwind }
+ attributes #1 = { nounwind readnone }
+
+...
+---
+
+# GCN-LABEL: name: s_fold_and_imm_regimm_32{{$}}
+# GCN: %10 = V_MOV_B32_e32 1543, implicit %exec
+# GCN: BUFFER_STORE_DWORD_OFFSET killed %10,
+name: s_fold_and_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_64_xexec }
+ - { id: 2, class: sreg_32_xm0 }
+ - { id: 3, class: sreg_32_xm0 }
+ - { id: 4, class: sreg_32_xm0 }
+ - { id: 5, class: sreg_32_xm0 }
+ - { id: 6, class: sreg_128 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1
+
+ %0 = COPY %sgpr0_sgpr1
+ %1 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %2 = COPY %1.sub1
+ %3 = COPY %1.sub0
+ %4 = S_MOV_B32 61440
+ %5 = S_MOV_B32 -1
+ %6 = REG_SEQUENCE killed %2, 1, killed %3, 2, killed %4, 3, killed %5, 4
+ %7 = S_MOV_B32 1234567
+ %8 = S_MOV_B32 9999
+ %9 = S_AND_B32 killed %7, killed %8, implicit-def dead %scc
+ %10 = COPY %9
+ BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: v_fold_and_imm_regimm_32{{$}}
+
+# GCN: %9 = V_MOV_B32_e32 646, implicit %exec
+# GCN: FLAT_STORE_DWORD %19, %9,
+
+# GCN: %10 = V_MOV_B32_e32 646, implicit %exec
+# GCN: FLAT_STORE_DWORD %19, %10
+
+# GCN: %11 = V_MOV_B32_e32 646, implicit %exec
+# GCN: FLAT_STORE_DWORD %19, %11,
+
+# GCN: %12 = V_MOV_B32_e32 1234567, implicit %exec
+# GCN: FLAT_STORE_DWORD %19, %12,
+
+# GCN: %13 = V_MOV_B32_e32 63, implicit %exec
+# GCN: FLAT_STORE_DWORD %19, %13,
+
+name: v_fold_and_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 20, class: sreg_32_xm0 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: sreg_32_xm0 }
+ - { id: 27, class: vgpr_32 }
+ - { id: 28, class: vgpr_32 }
+ - { id: 29, class: vgpr_32 }
+ - { id: 30, class: vgpr_32 }
+ - { id: 31, class: vgpr_32 }
+ - { id: 32, class: vreg_64 }
+ - { id: 33, class: vreg_64 }
+ - { id: 34, class: vgpr_32 }
+ - { id: 35, class: vgpr_32 }
+ - { id: 36, class: vgpr_32 }
+ - { id: 37, class: vreg_64 }
+ - { id: 44, class: vgpr_32 }
+
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %31 = V_ASHRREV_I32_e64 31, %3, implicit %exec
+ %32 = REG_SEQUENCE %3, 1, %31, 2
+ %33 = V_LSHLREV_B64 2, killed %32, implicit %exec
+ %20 = COPY %4.sub1
+ %44 = V_ADD_I32_e32 %4.sub0, %33.sub0, implicit-def %vcc, implicit %exec
+ %36 = COPY killed %20
+ %35 = V_ADDC_U32_e32 %33.sub1, %36, implicit-def %vcc, implicit %vcc, implicit %exec
+ %37 = REG_SEQUENCE %44, 1, killed %35, 2
+ %24 = V_MOV_B32_e32 982, implicit %exec
+ %26 = S_MOV_B32 1234567
+ %34 = V_MOV_B32_e32 63, implicit %exec
+
+ %27 = V_AND_B32_e64 %26, %24, implicit %exec
+ FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %28 = V_AND_B32_e64 %24, %26, implicit %exec
+ FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %29 = V_AND_B32_e32 %26, %24, implicit %exec
+ FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %30 = V_AND_B32_e64 %26, %26, implicit %exec
+ FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %31 = V_AND_B32_e64 %34, %34, implicit %exec
+ FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: s_fold_shl_imm_regimm_32{{$}}
+# GCN: %13 = V_MOV_B32_e32 4096, implicit %exec
+# GCN: BUFFER_STORE_DWORD_OFFSET killed %13,
+
+name: s_fold_shl_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_32_xm0_xexec }
+ - { id: 6, class: sreg_32_xm0 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sreg_32_xm0 }
+ - { id: 13, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1
+
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_MOV_B32 1
+ %6 = COPY %4.sub1
+ %7 = COPY %4.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
+ %12 = S_LSHL_B32 killed %5, 12, implicit-def dead %scc
+ %13 = COPY %12
+ BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: v_fold_shl_imm_regimm_32{{$}}
+
+# GCN: %11 = V_MOV_B32_e32 40955904, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %11,
+
+# GCN: %12 = V_MOV_B32_e32 24, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %12,
+
+# GCN: %13 = V_MOV_B32_e32 4096, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %13,
+
+# GCN: %14 = V_MOV_B32_e32 24, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %14,
+
+# GCN: %15 = V_MOV_B32_e32 0, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %15,
+
+# GCN: %22 = V_MOV_B32_e32 4096, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %22,
+
+# GCN: %23 = V_MOV_B32_e32 1, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %23,
+
+# GCN: %25 = V_MOV_B32_e32 2, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %25,
+
+# GCN: %26 = V_MOV_B32_e32 7927808, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %26,
+
+# GCN: %28 = V_MOV_B32_e32 -8, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %28,
+
+name: v_fold_shl_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64_xexec }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_32_xm0 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_64 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: vgpr_32 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+ - { id: 16, class: vreg_64 }
+ - { id: 17, class: vreg_64 }
+ - { id: 18, class: vgpr_32 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vgpr_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vgpr_32 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: sreg_32_xm0 }
+ - { id: 28, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %2 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
+ %16 = REG_SEQUENCE %2, 1, %15, 2
+ %17 = V_LSHLREV_B64 2, killed %16, implicit %exec
+ %9 = COPY %3.sub1
+ %21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec
+ %19 = COPY killed %9
+ %18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec
+ %20 = REG_SEQUENCE %21, 1, killed %18, 2
+ %10 = V_MOV_B32_e32 9999, implicit %exec
+ %24 = V_MOV_B32_e32 3871, implicit %exec
+ %6 = V_MOV_B32_e32 1, implicit %exec
+ %7 = S_MOV_B32 1
+ %27 = S_MOV_B32 -4
+
+ %11 = V_LSHLREV_B32_e64 12, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %13 = V_LSHL_B32_e64 %7, 12, implicit %exec
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %14 = V_LSHL_B32_e64 12, %7, implicit %exec
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %15 = V_LSHL_B32_e64 12, %24, implicit %exec
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %22 = V_LSHL_B32_e64 %6, 12, implicit %exec
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %23 = V_LSHL_B32_e64 %6, 32, implicit %exec
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %25 = V_LSHL_B32_e32 %6, %6, implicit %exec
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %26 = V_LSHLREV_B32_e32 11, %24, implicit %exec
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %28 = V_LSHL_B32_e32 %27, %6, implicit %exec
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: s_fold_ashr_imm_regimm_32{{$}}
+# GCN: %11 = V_MOV_B32_e32 243, implicit %exec
+# GCN: BUFFER_STORE_DWORD_OFFSET killed %11, killed %8,
+name: s_fold_ashr_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_32_xm0_xexec }
+ - { id: 6, class: sreg_32_xm0 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sreg_32_xm0 }
+ - { id: 13, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1
+
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_MOV_B32 999123
+ %6 = COPY %4.sub1
+ %7 = COPY %4.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
+ %12 = S_ASHR_I32 killed %5, 12, implicit-def dead %scc
+ %13 = COPY %12
+ BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ S_ENDPGM
+
+...
+
+# GCN-LABEL: name: v_fold_ashr_imm_regimm_32{{$}}
+# GCN: %11 = V_MOV_B32_e32 3903258, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %11,
+
+# GCN: %12 = V_MOV_B32_e32 62452139, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %12,
+
+# GCN: %13 = V_MOV_B32_e32 1678031, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %13,
+
+# GCN: %14 = V_MOV_B32_e32 3, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %14,
+
+# GCN: %15 = V_MOV_B32_e32 -1, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %15,
+
+# GCN: %22 = V_MOV_B32_e32 62500, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %22,
+
+# GCN: %23 = V_MOV_B32_e32 500000, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %23,
+
+# GCN: %25 = V_MOV_B32_e32 1920, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %25,
+
+# GCN: %26 = V_MOV_B32_e32 487907, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %26,
+
+# GCN: %28 = V_MOV_B32_e32 -1, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %28,
+
+name: v_fold_ashr_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64_xexec }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_32_xm0 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: vgpr_32 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+ - { id: 16, class: vreg_64 }
+ - { id: 17, class: vreg_64 }
+ - { id: 18, class: vgpr_32 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vgpr_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vgpr_32 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: sreg_32_xm0 }
+ - { id: 28, class: vgpr_32 }
+ - { id: 32, class: sreg_32_xm0 }
+ - { id: 33, class: sreg_32_xm0 }
+ - { id: 34, class: vgpr_32 }
+ - { id: 35, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %2 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
+ %16 = REG_SEQUENCE %2, 1, %15, 2
+ %17 = V_LSHLREV_B64 2, killed %16, implicit %exec
+ %9 = COPY %3.sub1
+ %21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec
+ %19 = COPY killed %9
+ %18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec
+ %20 = REG_SEQUENCE %21, 1, killed %18, 2
+ %10 = V_MOV_B32_e32 999234234, implicit %exec
+ %24 = V_MOV_B32_e32 3871, implicit %exec
+ %6 = V_MOV_B32_e32 1000000, implicit %exec
+ %7 = S_MOV_B32 13424252
+ %8 = S_MOV_B32 4
+ %27 = S_MOV_B32 -4
+ %32 = S_MOV_B32 1
+ %33 = S_MOV_B32 3841
+ %34 = V_MOV_B32_e32 3841, implicit %exec
+ %35 = V_MOV_B32_e32 2, implicit %exec
+
+ %11 = V_ASHRREV_I32_e64 8, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %13 = V_ASHR_I32_e64 %7, 3, implicit %exec
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %14 = V_ASHR_I32_e64 7, %32, implicit %exec
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %15 = V_ASHR_I32_e64 %27, %24, implicit %exec
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %22 = V_ASHR_I32_e64 %6, 4, implicit %exec
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %23 = V_ASHR_I32_e64 %6, %33, implicit %exec
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %25 = V_ASHR_I32_e32 %34, %34, implicit %exec
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %26 = V_ASHRREV_I32_e32 11, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %28 = V_ASHR_I32_e32 %27, %35, implicit %exec
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: s_fold_lshr_imm_regimm_32{{$}}
+# GCN: %11 = V_MOV_B32_e32 1048332, implicit %exec
+# GCN: BUFFER_STORE_DWORD_OFFSET killed %11, killed %8,
+name: s_fold_lshr_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_32_xm0_xexec }
+ - { id: 6, class: sreg_32_xm0 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sreg_32_xm0 }
+ - { id: 13, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1
+
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_MOV_B32 -999123
+ %6 = COPY %4.sub1
+ %7 = COPY %4.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
+ %12 = S_LSHR_B32 killed %5, 12, implicit-def dead %scc
+ %13 = COPY %12
+ BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: v_fold_lshr_imm_regimm_32{{$}}
+# GCN: %11 = V_MOV_B32_e32 3903258, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %11,
+
+# GCN: %12 = V_MOV_B32_e32 62452139, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %12,
+
+# GCN: %13 = V_MOV_B32_e32 1678031, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %13,
+
+# GCN: %14 = V_MOV_B32_e32 3, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %14,
+
+# GCN: %15 = V_MOV_B32_e32 1, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %15,
+
+# GCN: %22 = V_MOV_B32_e32 62500, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %22,
+
+# GCN: %23 = V_MOV_B32_e32 500000, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %23,
+
+# GCN: %25 = V_MOV_B32_e32 1920, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %25,
+
+# GCN: %26 = V_MOV_B32_e32 487907, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %26,
+
+# GCN: %28 = V_MOV_B32_e32 1073741823, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %28,
+
+name: v_fold_lshr_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64_xexec }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_32_xm0 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: vgpr_32 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+ - { id: 16, class: vreg_64 }
+ - { id: 17, class: vreg_64 }
+ - { id: 18, class: vgpr_32 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vgpr_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vgpr_32 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: sreg_32_xm0 }
+ - { id: 28, class: vgpr_32 }
+ - { id: 32, class: sreg_32_xm0 }
+ - { id: 33, class: sreg_32_xm0 }
+ - { id: 34, class: vgpr_32 }
+ - { id: 35, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %2 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
+ %16 = REG_SEQUENCE %2, 1, %15, 2
+ %17 = V_LSHLREV_B64 2, killed %16, implicit %exec
+ %9 = COPY %3.sub1
+ %21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec
+ %19 = COPY killed %9
+ %18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec
+ %20 = REG_SEQUENCE %21, 1, killed %18, 2
+ %10 = V_MOV_B32_e32 999234234, implicit %exec
+ %24 = V_MOV_B32_e32 3871, implicit %exec
+ %6 = V_MOV_B32_e32 1000000, implicit %exec
+ %7 = S_MOV_B32 13424252
+ %8 = S_MOV_B32 4
+ %27 = S_MOV_B32 -4
+ %32 = S_MOV_B32 1
+ %33 = S_MOV_B32 3841
+ %34 = V_MOV_B32_e32 3841, implicit %exec
+ %35 = V_MOV_B32_e32 2, implicit %exec
+
+ %11 = V_LSHRREV_B32_e64 8, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %13 = V_LSHR_B32_e64 %7, 3, implicit %exec
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %14 = V_LSHR_B32_e64 7, %32, implicit %exec
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %15 = V_LSHR_B32_e64 %27, %24, implicit %exec
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %22 = V_LSHR_B32_e64 %6, 4, implicit %exec
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %23 = V_LSHR_B32_e64 %6, %33, implicit %exec
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %25 = V_LSHR_B32_e32 %34, %34, implicit %exec
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %26 = V_LSHRREV_B32_e32 11, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %28 = V_LSHR_B32_e32 %27, %35, implicit %exec
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ S_ENDPGM
+
+...
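
For reference, the immediates expected by the GCN check lines above fall out of folding the
constant operands directly: in s_fold_and_imm_regimm_32 the expected 1543 is 1234567 & 9999
(0x12D687 & 0x270F = 0x607), and the repeated 646 values in v_fold_and_imm_regimm_32 are
1234567 & 982 (0x12D687 & 0x3D6 = 0x286).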
diff --git a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
new file mode 100644
index 000000000000..b74bce76f79c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
@@ -0,0 +1,262 @@
+; RUN: llc -march=amdgcn -mattr=+fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mattr=-fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
+
+; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
+; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s
+
+; FIXME: This should also fold when fma is actually fast if an FMA
+; exists in the original program.
+
+; (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))
+
+; GCN-LABEL: {{^}}fast_add_fmuladd_fmul:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[V]], [[U]]
+; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[Y]], [[X]]
+; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]
+
+; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]]
+; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
+; GCN-FASTFMA: buffer_store_dword [[FMA1]]
+
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+define void @fast_add_fmuladd_fmul() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ %add = fadd fast float %fma, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]]], [[U]], [[V]], -[[Z]]
+; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[Y]], [[X]]
+; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]
+
+; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
+; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
+; GCN-FASTFMA: buffer_store_dword [[FMA1]]
+define void @fast_sub_fmuladd_fmul() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ %add = fsub fast float %fma, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
+; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]]
+
+; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
+; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]]
+
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+define void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ store volatile float %mul.u.v, float addrspace(1)* undef
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ %add = fadd fast float %fma, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul_commute:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
+; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]]
+
+; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
+; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]
+
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ store volatile float %mul.u.v, float addrspace(1)* undef
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ %add = fadd fast float %z, %fma
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ store volatile float %fma, float addrspace(1)* undef
+ %add = fadd fast float %fma, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ store volatile float %fma, float addrspace(1)* undef
+ %add = fadd fast float %z, %fma
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_mul:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+
+; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[Y]], [[X]], [[MUL]]
+; GCN-FLUSH: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]]
+
+; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
+; GCN-FASTFMA: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]]
+
+; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_subrev_f32_e32 [[MAD:v[0-9]+]]
+
+; GCN: buffer_store_dword [[MUL]]
+; GCN: buffer_store_dword [[MAD]]
+define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ %add = fsub fast float %fma, %z
+ store volatile float %mul.u.v, float addrspace(1)* undef
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+
+; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
+; GCN-FLUSH-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]]
+; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
+; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
+
+; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
+; GCN-FASTFMA-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
+; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
+; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
+
+; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_subrev_f32_e32
+define void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ %add = fsub fast float %fma, %z
+ store volatile float %fma, float addrspace(1)* undef
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+declare float @llvm.fma.f32(float, float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 9df336c2c489..10acae092e9f 100644
--- a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -18,9 +18,9 @@ declare float @llvm.fabs.f32(float) #1
; VI: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0
; VI: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
; VI: v_cndmask_b32_e32
-; VI: v_add_f32_e32
-; VI: v_mul_f32_e32
-; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
+; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
+; VI: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; VI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
%a11 = fadd fast float %y, -1.0
%a12 = call float @llvm.fabs.f32(float %a11)
@@ -113,9 +113,9 @@ define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
; VI: v_add_f16_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0
; VI: v_cmp_gt_f16_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
; VI: v_cndmask_b32_e32
-; VI: v_add_f16_e32
-; VI: v_mul_f16_e32
-; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
+; VI: v_add_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
+; VI: v_mul_f16_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
%x = bitcast i16 %x.arg to half
%y = bitcast i16 %y.arg to half
diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll
new file mode 100644
index 000000000000..d555d8d871de
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -0,0 +1,1282 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+
+; --------------------------------------------------------------------------------
+; fadd tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_add_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
+; GCN-NEXT: buffer_store_dword [[RESULT]]
+define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %add = fadd float %a, %b
+ %fneg = fsub float -0.000000e+00, %add
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
+; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %add = fadd float %a, %b
+ %fneg = fsub float -0.000000e+00, %add
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %add, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
+; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %add = fadd float %a, %b
+ %fneg = fsub float -0.000000e+00, %add
+ %use1 = fmul float %add, 4.0
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %add = fadd float %fneg.a, %b
+ %fneg = fsub float -0.000000e+00, %add
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.b = fsub float -0.000000e+00, %b
+ %add = fadd float %a, %fneg.b
+ %fneg = fsub float -0.000000e+00, %add
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fneg.b = fsub float -0.000000e+00, %b
+ %add = fadd float %fneg.a, %fneg.b
+ %fneg = fsub float -0.000000e+00, %add
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NEXT: buffer_store_dword [[NEG_A]]
+define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %add = fadd float %fneg.a, %b
+ %fneg = fsub float -0.000000e+00, %add
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %fneg.a, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %add = fadd float %fneg.a, %b
+ %fneg = fsub float -0.000000e+00, %add
+ %use1 = fmul float %fneg.a, %c
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fmul tests
+; --------------------------------------------------------------------------------
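+; The negation of an fmul result is expected to fold into the multiply's
+; source modifiers (or to cancel an fneg already present on an operand).
+; When the unnegated product has other users, the negation is instead
+; materialized by flipping the sign bit with v_xor_b32.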
+
+; GCN-LABEL: {{^}}v_fneg_mul_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: buffer_store_dword [[RESULT]]
+define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = fmul float %a, %b
+ %fneg = fsub float -0.000000e+00, %mul
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[MUL]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
+; GCN: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = fmul float %a, %b
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %mul, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[MUL0]]
+; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], 4.0, [[MUL0]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
+; GCN: buffer_store_dword [[MUL1]]
+define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = fmul float %a, %b
+ %fneg = fsub float -0.000000e+00, %mul
+ %use1 = fmul float %mul, 4.0
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %mul = fmul float %fneg.a, %b
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.b = fsub float -0.000000e+00, %b
+ %mul = fmul float %a, %fneg.b
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fneg.b = fsub float -0.000000e+00, %b
+ %mul = fmul float %fneg.a, %fneg.b
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
+; GCN: buffer_store_dword [[NEG_A]]
+define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %mul = fmul float %fneg.a, %b
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %fneg.a, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
+; GCN: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %mul = fmul float %fneg.a, %b
+ %fneg = fsub float -0.000000e+00, %mul
+ %use1 = fmul float %fneg.a, %c
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fma tests
+; --------------------------------------------------------------------------------
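+; fneg of llvm.fma.f32 is expected to fold into the fma source modifiers,
+; i.e. -(a*b+c) selects v_fma_f32 with -b and -c; fnegs already present on
+; the operands cancel where possible.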
+
+; GCN-LABEL: {{^}}v_fneg_fma_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[RESULT]]
+define void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
+; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+define void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %fma, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]
+; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ %use1 = fmul float %fma, 4.0
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_y_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.b = fsub float -0.000000e+00, %b
+ %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fneg.b = fsub float -0.000000e+00, %b
+ %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fneg.c = fsub float -0.000000e+00, %c
+ %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.c = fsub float -0.000000e+00, %c
+ %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+; GCN-NEXT: buffer_store_dword [[NEG_A]]
+define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %fneg.a, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fneg_x_y_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ %use1 = fmul float %fneg.a, %d
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fmad tests
+; --------------------------------------------------------------------------------
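+; Same expectations as the fma tests, but for llvm.fmuladd.f32, which selects
+; to v_mad_f32 (v_mac_f32 in the multi-use case, where the negation is then a
+; v_xor_b32 of the sign bit).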
+
+; GCN-LABEL: {{^}}v_fneg_fmad_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[RESULT]]
+define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_C:v[0-9]+]], 0x80000000, [[C]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
+; GCN-NEXT: buffer_store_dword [[NEG_C]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ %use1 = fmul float %fma, 4.0
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fp_extend tests
+; --------------------------------------------------------------------------------
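+; fneg is expected to fold into the source modifier of v_cvt_f64_f32; when the
+; extended value has other users, only the sign bit of the high dword of the
+; f64 result is flipped.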
+
+; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
+; GCN: buffer_store_dwordx2 [[RESULT]]
+define void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fpext = fpext float %a to double
+ %fneg = fsub double -0.000000e+00, %fpext
+ store double %fneg, double addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
+; GCN: buffer_store_dwordx2 [[RESULT]]
+define void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fpext = fpext float %fneg.a to double
+ %fneg = fsub double -0.000000e+00, %fpext
+ store double %fneg, double addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN: buffer_store_dwordx2 [[RESULT]]
+; GCN: buffer_store_dword [[FNEG_A]]
+define void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fpext = fpext float %fneg.a to double
+ %fneg = fsub double -0.000000e+00, %fpext
+ store volatile double %fneg, double addrspace(1)* %out.gep
+ store volatile float %fneg.a, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
+; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}
+define void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fpext = fpext float %a to double
+ %fneg = fsub double -0.000000e+00, %fpext
+ store volatile double %fneg, double addrspace(1)* %out.gep
+ store volatile double %fpext, double addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
+; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
+; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
+; GCN: buffer_store_dwordx2 [[MUL]]
+define void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fpext = fpext float %a to double
+ %fneg = fsub double -0.000000e+00, %fpext
+ %mul = fmul double %fpext, 4.0
+ store volatile double %fneg, double addrspace(1)* %out.gep
+ store volatile double %mul, double addrspace(1)* %out.gep
+ ret void
+}
+
+; FIXME: Source modifiers not folded for f16->f32
+; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
+define void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile half, half addrspace(1)* %a.gep
+ %fpext = fpext half %a to float
+ %fneg = fsub float -0.000000e+00, %fpext
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile float %fpext, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
+define void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile half, half addrspace(1)* %a.gep
+ %fpext = fpext half %a to float
+ %fneg = fsub float -0.000000e+00, %fpext
+ %mul = fmul float %fpext, 4.0
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile float %mul, float addrspace(1)* %out.gep
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fp_round tests
+; --------------------------------------------------------------------------------
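+; fneg is expected to fold into the source modifier of v_cvt_f32_f64 and
+; v_cvt_f16_f32, and an fneg of a truncate of an already negated value
+; selects the plain conversion.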
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile double, double addrspace(1)* %a.gep
+ %fpround = fptrunc double %a to float
+ %fneg = fsub float -0.000000e+00, %fpround
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile double, double addrspace(1)* %a.gep
+ %fneg.a = fsub double -0.000000e+00, %a
+ %fpround = fptrunc double %fneg.a to float
+ %fneg = fsub float -0.000000e+00, %fpround
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32:
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
+; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v{{\[}}[[A_LO]]:[[A_HI]]{{\]}}
+; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
+; GCN-DAG: v_mov_b32_e32 v[[NEG_A_LO:[0-9]+]], v[[A_LO]]
+; GCN: buffer_store_dword [[RESULT]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[NEG_A_LO]]:[[NEG_A_HI]]{{\]}}
+define void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile double, double addrspace(1)* %a.gep
+ %fneg.a = fsub double -0.000000e+00, %a
+ %fpround = fptrunc double %fneg.a to float
+ %fneg = fsub float -0.000000e+00, %fpround
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile double %fneg.a, double addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}
+; GCN: buffer_store_dword [[RESULT]]
+; GCN: buffer_store_dwordx2 [[USE1]]
+define void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile double, double addrspace(1)* %a.gep
+ %fneg.a = fsub double -0.000000e+00, %a
+ %fpround = fptrunc double %fneg.a to float
+ %fneg = fsub float -0.000000e+00, %fpround
+ %use1 = fmul double %fneg.a, %c
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile double %use1, double addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: buffer_store_short [[RESULT]]
+define void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fpround = fptrunc float %a to half
+ %fneg = fsub half -0.000000e+00, %fpround
+ store half %fneg, half addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN: buffer_store_short [[RESULT]]
+define void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fpround = fptrunc float %fneg.a to half
+ %fneg = fsub half -0.000000e+00, %fpround
+ store half %fneg, half addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
+; GCN: buffer_store_dword [[NEG]]
+; GCN: buffer_store_dword [[CVT]]
+define void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile double, double addrspace(1)* %a.gep
+ %fpround = fptrunc double %a to float
+ %fneg = fsub float -0.000000e+00, %fpround
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile float %fpround, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN: buffer_store_short [[RESULT]]
+; GCN: buffer_store_dword [[NEG_A]]
+define void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fpround = fptrunc float %fneg.a to half
+ %fneg = fsub half -0.000000e+00, %fpround
+ store volatile half %fneg, half addrspace(1)* %out.gep
+ store volatile float %fneg.a, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
+; GCN: buffer_store_short [[RESULT]]
+; GCN: buffer_store_dword [[USE1]]
+define void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fpround = fptrunc float %fneg.a to half
+ %fneg = fsub half -0.000000e+00, %fpround
+ %use1 = fmul float %fneg.a, %c
+ store volatile half %fneg, half addrspace(1)* %out.gep
+ store volatile float %use1, float addrspace(1)* undef
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; rcp tests
+; --------------------------------------------------------------------------------
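+; fneg is expected to fold into the source modifier of v_rcp_f32, and
+; fneg(rcp(fneg(x))) selects the plain v_rcp_f32.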
+
+; GCN-LABEL: {{^}}v_fneg_rcp_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %a)
+ %fneg = fsub float -0.000000e+00, %rcp
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
+ %fneg = fsub float -0.000000e+00, %rcp
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN: buffer_store_dword [[RESULT]]
+; GCN: buffer_store_dword [[NEG_A]]
+define void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
+ %fneg = fsub float -0.000000e+00, %rcp
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile float %fneg.a, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN: buffer_store_dword [[RESULT]]
+; GCN: buffer_store_dword [[MUL]]
+define void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
+ %fneg = fsub float -0.000000e+00, %rcp
+ %use1 = fmul float %fneg.a, %c
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile float %use1, float addrspace(1)* undef
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; rcp_legacy tests
+; --------------------------------------------------------------------------------
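+; Same expectation as the rcp tests above, for the llvm.amdgcn.rcp.legacy
+; intrinsic (v_rcp_legacy_f32).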
+
+; GCN-LABEL: {{^}}v_fneg_rcp_legacy_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_rcp_legacy_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %rcp = call float @llvm.amdgcn.rcp.legacy(float %a)
+ %fneg = fsub float -0.000000e+00, %rcp
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fmul_legacy tests
+; --------------------------------------------------------------------------------
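+; These mirror the fmul tests above, using the llvm.amdgcn.fmul.legacy
+; intrinsic, which selects to v_mul_legacy_f32.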
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: buffer_store_dword [[RESULT]]
+define void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %mul
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[MUL]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
+; GCN: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %mul, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[MUL0]]
+; GCN: v_mul_legacy_f32_e32 [[MUL1:v[0-9]+]], 4.0, [[MUL0]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
+; GCN: buffer_store_dword [[MUL1]]
+define void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %mul
+ %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0)
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.b = fsub float -0.000000e+00, %b
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fneg.b = fsub float -0.000000e+00, %b
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b)
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
+; GCN: buffer_store_dword [[NEG_A]]
+define void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %fneg.a, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
+; GCN: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
+ %fneg = fsub float -0.000000e+00, %mul
+ %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c)
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; sin tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_sin_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[K]], -[[A]]
+; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
+; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %sin = call float @llvm.sin.f32(float %a)
+ %fneg = fsub float -0.000000e+00, %sin
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %sin = call float @llvm.amdgcn.sin.f32(float %a)
+ %fneg = fsub float -0.000000e+00, %sin
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.fma.f32(float, float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare float @llvm.sin.f32(float) #1
+
+declare float @llvm.amdgcn.sin.f32(float) #1
+declare float @llvm.amdgcn.rcp.f32(float) #1
+declare float @llvm.amdgcn.rcp.legacy(float) #1
+declare float @llvm.amdgcn.fmul.legacy(float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fp16_to_fp.ll b/test/CodeGen/AMDGPU/fp16_to_fp.ll
deleted file mode 100644
index 5a79ca82bc29..000000000000
--- a/test/CodeGen/AMDGPU/fp16_to_fp.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
-declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
-
-; SI-LABEL: {{^}}test_convert_fp16_to_fp32:
-; SI: buffer_load_ushort [[VAL:v[0-9]+]]
-; SI: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[RESULT]]
-define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
- %val = load i16, i16 addrspace(1)* %in, align 2
- %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
- store float %cvt, float addrspace(1)* %out, align 4
- ret void
-}
-
-
-; SI-LABEL: {{^}}test_convert_fp16_to_fp64:
-; SI: buffer_load_ushort [[VAL:v[0-9]+]]
-; SI: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]]
-; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]]
-; SI: buffer_store_dwordx2 [[RESULT]]
-define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
- %val = load i16, i16 addrspace(1)* %in, align 2
- %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
- store double %cvt, double addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/test/CodeGen/AMDGPU/fp16_to_fp32.ll
new file mode 100644
index 000000000000..35e9541692db
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fp16_to_fp32.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
+
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+
+; FUNC-LABEL: {{^}}test_convert_fp16_to_fp32:
+; GCN: buffer_load_ushort [[VAL:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[RESULT]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RES:T[0-9]+\.[XYZW]]]
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[RES:T[0-9]+\.[XYZW]]]
+; EGCM: VTX_READ_16 [[VAL:T[0-9]+\.[XYZW]]]
+; EGCM: FLT16_TO_FLT32{{[ *]*}}[[RES]], [[VAL]]
+define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+ %val = load i16, i16 addrspace(1)* %in, align 2
+ %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/test/CodeGen/AMDGPU/fp16_to_fp64.ll
new file mode 100644
index 000000000000..8b05d7b88a10
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fp16_to_fp64.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+
+; FUNC-LABEL: {{^}}test_convert_fp16_to_fp64:
+; GCN: buffer_load_ushort [[VAL:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]]
+; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]]
+; GCN: buffer_store_dwordx2 [[RESULT]]
+define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+ %val = load i16, i16 addrspace(1)* %in, align 2
+ %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
+ store double %cvt, double addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index 67925ebd82b6..346ad822f293 100644
--- a/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -1,12 +1,17 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
-; SI-LABEL: {{^}}test_convert_fp32_to_fp16:
-; SI: buffer_load_dword [[VAL:v[0-9]+]]
-; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: buffer_store_short [[RESULT]]
+; FUNC-LABEL: {{^}}test_convert_fp32_to_fp16:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_short [[RESULT]]
+
+; EG: MEM_RAT MSKOR
+; EG: VTX_READ_32
+; EG: FLT32_TO_FLT16
define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%val = load float, float addrspace(1)* %in, align 4
%cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 7351665f06e4..2c538b16e743 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -207,11 +207,15 @@ define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16>
; GCN: buffer_load_ushort v{{[0-9]+}}, off
; GCN: buffer_load_ushort v{{[0-9]+}}, off
+; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 0{{$}}
+; GCN-DAG: s_and_b32 [[MASK_IDX:s[0-9]+]], s{{[0-9]+}}, 3{{$}}
+; GCN-DAG: v_or_b32_e32 [[IDX:v[0-9]+]], [[MASK_IDX]], [[BASE_FI]]{{$}}
+
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
-; GCN: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GCN: buffer_store_short v{{[0-9]+}}, [[IDX]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
; GCN: s_waitcnt
diff --git a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
index d49fa2bf48a7..2ef045dbb8eb 100644
--- a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
+++ b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
@@ -7,11 +7,14 @@
;
; CHECK-LABEL: {{^}}main:
+; CHECK-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200
+; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; CHECK-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
-; CHECK-DAG: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]]
+; CHECK-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
; TODO: add 0?
-; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]]
+; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]]
+; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]]
; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll
index 0e6281940c24..d141281f36b8 100644
--- a/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/test/CodeGen/AMDGPU/mad-combine.ll
@@ -273,12 +273,12 @@ define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, fl
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -306,15 +306,15 @@ define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float a
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
-; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
-; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
new file mode 100644
index 000000000000..559d464f36a5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+
+; --------------------------------------------------------------------------------
+; Don't fold if fneg can fold into the source
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_legacy_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_rcp_legacy_f32_e32 [[RCP:v[0-9]+]], [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
+; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
+; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %rcp = call float @llvm.amdgcn.rcp.legacy(float %x)
+ %fneg = fsub float -0.0, %rcp
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_mul_legacy_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc
+; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
+; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+define void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %x, float 4.0)
+ %fneg = fsub float -0.0, %mul
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+declare float @llvm.amdgcn.rcp.legacy(float) #1
+declare float @llvm.amdgcn.fmul.legacy(float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
new file mode 100644
index 000000000000..d9d311cd032b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -0,0 +1,840 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}add_select_fabs_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
+define void @add_select_fabs_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %select = select i1 %cmp, float %fabs.x, float %fabs.y
+ %add = fadd float %select, %z
+ store float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_multi_use_lhs_fabs_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[W:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[W]]
+define void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %w = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %select = select i1 %cmp, float %fabs.x, float %fabs.y
+ %add0 = fadd float %select, %z
+ %add1 = fadd float %fabs.x, %w
+ store volatile float %add0, float addrspace(1)* undef
+ store volatile float %add1, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_multi_store_use_lhs_fabs_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN-DAG: v_add_f32_e64 [[ADD:v[0-9]+]], |[[SELECT]]|, [[Z]]
+; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+
+; GCN: buffer_store_dword [[ADD]]
+; GCN: buffer_store_dword [[X_ABS]]
+define void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %select = select i1 %cmp, float %fabs.x, float %fabs.y
+ %add0 = fadd float %select, %z
+ store volatile float %add0, float addrspace(1)* undef
+ store volatile float %fabs.x, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_multi_use_rhs_fabs_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[W:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[Y]]|, [[W]]
+define void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %w = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %select = select i1 %cmp, float %fabs.x, float %fabs.y
+ %add0 = fadd float %select, %z
+ %add1 = fadd float %fabs.y, %w
+ store volatile float %add0, float addrspace(1)* undef
+ store volatile float %add1, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fabs_var_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+define void @add_select_fabs_var_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %select = select i1 %cmp, float %fabs.x, float %y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fabs_negk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+define void @add_select_fabs_negk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %select = select i1 %cmp, float %fabs, float -1.0
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; FIXME: fabs should fold away
+; GCN-LABEL: {{^}}add_select_fabs_negk_negk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
+; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[X]]
+define void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float -2.0, float -1.0
+ %fabs = call float @llvm.fabs.f32(float %select)
+ %add = fadd float %fabs, %x
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_posk_posk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+define void @add_select_posk_posk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float 2.0, float 1.0
+ %add = fadd float %select, %x
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negk_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+define void @add_select_negk_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %select = select i1 %cmp, float -1.0, float %fabs
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negliteralk_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xc4800000
+
+; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+define void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %select = select i1 %cmp, float -1024.0, float %fabs
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fabs_posk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
+; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
+define void @add_select_fabs_posk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+
+ %cmp = icmp eq i32 %c, 0
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %select = select i1 %cmp, float %fabs, float 1.0
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_posk_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
+; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
+define void @add_select_posk_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %select = select i1 %cmp, float 1.0, float %fabs
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fneg_fneg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+define void @add_select_fneg_fneg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %fneg.y = fsub float -0.0, %y
+ %select = select i1 %cmp, float %fneg.x, float %fneg.y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_multi_use_lhs_fneg_fneg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[W:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]]
+define void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %w = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %fneg.y = fsub float -0.0, %y
+ %select = select i1 %cmp, float %fneg.x, float %fneg.y
+ %add0 = fadd float %select, %z
+ %add1 = fadd float %fneg.x, %w
+ store volatile float %add0, float addrspace(1)* undef
+ store volatile float %add1, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_multi_store_use_lhs_fneg_fneg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_xor_b32_e32 [[NEG_X:v[0-9]+]], 0x80000000, [[X]]
+; GCN-DAG: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[SELECT]], [[Z]]
+
+; GCN: buffer_store_dword [[ADD]]
+; GCN: buffer_store_dword [[NEG_X]]
+define void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %fneg.y = fsub float -0.0, %y
+ %select = select i1 %cmp, float %fneg.x, float %fneg.y
+ %add0 = fadd float %select, %z
+ store volatile float %add0, float addrspace(1)* undef
+ store volatile float %fneg.x, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_multi_use_rhs_fneg_fneg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[W:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]]
+define void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %w = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %fneg.y = fsub float -0.0, %y
+ %select = select i1 %cmp, float %fneg.x, float %fneg.y
+ %add0 = fadd float %select, %z
+ %add1 = fadd float %fneg.y, %w
+ store volatile float %add0, float addrspace(1)* undef
+ store volatile float %add1, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fneg_var_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+define void @add_select_fneg_var_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float %fneg.x, float %y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fneg_negk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+define void @add_select_fneg_negk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float %fneg.x, float -1.0
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fneg_inv2pi_f32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+define void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float %fneg.x, float 0x3FC45F3060000000
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fneg_neginv2pi_f32:
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
+; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc
+
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+define void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float %fneg.x, float 0xBFC45F3060000000
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negk_negk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_cmp_eq_u32_e64
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+define void @add_select_negk_negk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float -2.0, float -1.0
+ %add = fadd float %select, %x
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negliteralk_negliteralk_f32:
+; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0xc5000000
+; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc5800000
+; GCN-DAG: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_cmp_eq_u32_e64
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+define void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float -2048.0, float -4096.0
+ %add = fadd float %select, %x
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fneg_negk_negk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
+define void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float -2.0, float -1.0
+ %fneg.x = fsub float -0.0, %select
+ %add = fadd float %fneg.x, %x
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negk_fneg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+define void @add_select_negk_fneg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float -1.0, float %fneg.x
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fneg_posk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+define void @add_select_fneg_posk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float %fneg.x, float 1.0
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_posk_fneg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+define void @add_select_posk_fneg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float 1.0, float %fneg.x
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negfabs_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_or_b32_e32 [[X_NEG_ABS:v[0-9]+]], 0x80000000, [[X]]
+; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+define void @add_select_negfabs_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %select = select i1 %cmp, float %fneg.fabs.x, float %fabs.y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fabs_negfabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_or_b32_e32 [[Y_NEG_ABS:v[0-9]+]], 0x80000000, [[Y]]
+; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+define void @add_select_fabs_negfabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %fneg.fabs.y = fsub float -0.000000e+00, %fabs.y
+ %select = select i1 %cmp, float %fabs.x, float %fneg.fabs.y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_neg_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
+; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+define void @add_select_neg_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.000000e+00, %x
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %select = select i1 %cmp, float %fneg.x, float %fabs.y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fabs_neg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+define void @add_select_fabs_neg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.y = fsub float -0.000000e+00, %y
+ %select = select i1 %cmp, float %fabs.x, float %fneg.y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_neg_negfabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+define void @add_select_neg_negfabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.000000e+00, %x
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %fneg.fabs.y = fsub float -0.000000e+00, %fabs.y
+ %select = select i1 %cmp, float %fneg.x, float %fneg.fabs.y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negfabs_neg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+define void @add_select_negfabs_neg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
+ %fneg.y = fsub float -0.000000e+00, %y
+ %select = select i1 %cmp, float %fneg.y, float %fneg.fabs.x
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_select_negfabs_posk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN-DAG: v_cmp_eq_u32_e64 vcc,
+; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+define void @mul_select_negfabs_posk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
+ %select = select i1 %cmp, float %fneg.fabs.x, float 4.0
+ %add = fmul float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_select_posk_negfabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+define void @mul_select_posk_negfabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
+ %select = select i1 %cmp, float 4.0, float %fneg.fabs.x
+ %add = fmul float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_select_negfabs_negk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
+define void @mul_select_negfabs_negk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
+ %select = select i1 %cmp, float %fneg.fabs.x, float -4.0
+ %add = fmul float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_select_negk_negfabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cmp_ne_u32_e64 vcc
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
+define void @mul_select_negk_negfabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
+ %select = select i1 %cmp, float -4.0, float %fneg.fabs.x
+ %add = fmul float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; Don't fold if fneg can fold into the source
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_add_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], -4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_add_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %add = fadd float %x, 4.0
+ %fneg = fsub float -0.0, %add
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_sub_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], 4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_sub_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %add = fsub float %x, 4.0
+ %fneg = fsub float -0.0, %add
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_mul_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_mul_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %mul = fmul float %x, 4.0
+ %fneg = fsub float -0.0, %mul
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_fma_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[X]], -4.0, -[[Z]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[FMA]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_fma_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fma = call float @llvm.fma.f32(float %x, float 4.0, float %z)
+ %fneg = fsub float -0.0, %fma
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_fmad_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[X]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fmad = call float @llvm.fmuladd.f32(float %x, float 4.0, float %z)
+ %fneg = fsub float -0.0, %fmad
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+; FIXME: This one should fold to rcp
+; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
+; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
+; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+define void @select_fneg_posk_src_rcp_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %x)
+ %fneg = fsub float -0.0, %rcp
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.fma.f32(float, float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare float @llvm.amdgcn.rcp.f32(float) #1
+declare float @llvm.amdgcn.rcp.legacy(float) #1
+declare float @llvm.amdgcn.fmul.legacy(float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/select-opt.ll b/test/CodeGen/AMDGPU/select-opt.ll
new file mode 100644
index 000000000000..ad358d33c405
--- /dev/null
+++ b/test/CodeGen/AMDGPU/select-opt.ll
@@ -0,0 +1,161 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Make sure to test with f32 and i32 compares. If we have to use float
+; compares, we always have multiple condition registers. If we can do
+; scalar compares, we don't want to use multiple condition registers.
+
+; GCN-LABEL: {{^}}opt_select_i32_and_cmp_i32:
+; GCN-DAG: v_cmp_ne_u32_e32 vcc,
+; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_and_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
+ %icmp0 = icmp ne i32 %a, %b
+ %icmp1 = icmp ne i32 %a, %c
+ %and = and i1 %icmp0, %icmp1
+ %select = select i1 %and, i32 %x, i32 %y
+ store i32 %select, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i32_and_cmp_f32:
+; GCN-DAG: v_cmp_lg_f32_e32 vcc
+; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_and_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
+ %fcmp0 = fcmp one float %a, %b
+ %fcmp1 = fcmp one float %a, %c
+ %and = and i1 %fcmp0, %fcmp1
+ %select = select i1 %and, i32 %x, i32 %y
+ store i32 %select, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i64_and_cmp_i32:
+; GCN-DAG: v_cmp_ne_u32_e32 vcc,
+; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_and_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
+define void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
+ %icmp0 = icmp ne i32 %a, %b
+ %icmp1 = icmp ne i32 %a, %c
+ %and = and i1 %icmp0, %icmp1
+ %select = select i1 %and, i64 %x, i64 %y
+ store i64 %select, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i64_and_cmp_f32:
+; GCN-DAG: v_cmp_lg_f32_e32 vcc,
+; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_and_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
+define void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
+ %fcmp0 = fcmp one float %a, %b
+ %fcmp1 = fcmp one float %a, %c
+ %and = and i1 %fcmp0, %fcmp1
+ %select = select i1 %and, i64 %x, i64 %y
+ store i64 %select, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i32_or_cmp_i32:
+; GCN-DAG: v_cmp_ne_u32_e32 vcc,
+; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_or_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+; GCN: s_endpgm
+define void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
+ %icmp0 = icmp ne i32 %a, %b
+ %icmp1 = icmp ne i32 %a, %c
+ %or = or i1 %icmp0, %icmp1
+ %select = select i1 %or, i32 %x, i32 %y
+ store i32 %select, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i32_or_cmp_f32:
+; GCN-DAG: v_cmp_lg_f32_e32 vcc
+; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_or_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
+ %fcmp0 = fcmp one float %a, %b
+ %fcmp1 = fcmp one float %a, %c
+ %or = or i1 %fcmp0, %fcmp1
+ %select = select i1 %or, i32 %x, i32 %y
+ store i32 %select, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i64_or_cmp_i32:
+; GCN-DAG: v_cmp_ne_u32_e32 vcc,
+; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_or_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
+define void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
+ %icmp0 = icmp ne i32 %a, %b
+ %icmp1 = icmp ne i32 %a, %c
+ %or = or i1 %icmp0, %icmp1
+ %select = select i1 %or, i64 %x, i64 %y
+ store i64 %select, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i64_or_cmp_f32:
+; GCN-DAG: v_cmp_lg_f32_e32 vcc,
+; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_or_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
+define void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
+ %fcmp0 = fcmp one float %a, %b
+ %fcmp1 = fcmp one float %a, %c
+ %or = or i1 %fcmp0, %fcmp1
+ %select = select i1 %or, i64 %x, i64 %y
+ store i64 %select, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}regression:
+; GCN: v_cmp_neq_f32_e64 vcc
+; GCN: v_cmp_neq_f32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
+
+define void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
+entry:
+ %cmp0 = fcmp oeq float %c0, 1.0
+ br i1 %cmp0, label %if0, label %endif
+
+if0:
+ %cmp1 = fcmp oeq float %c1, 0.0
+ br i1 %cmp1, label %if1, label %endif
+
+if1:
+ %cmp2 = xor i1 %cmp1, true
+ br label %endif
+
+endif:
+ %tmp0 = phi i1 [ true, %entry ], [ %cmp2, %if1 ], [ false, %if0 ]
+ %tmp2 = select i1 %tmp0, float 4.0, float 0.0
+ store float %tmp2, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
index f9216d311471..84f9b7bb8064 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -2,6 +2,8 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; FIXME: i16 promotion pass ruins the scalar cases when legal.
+
; FUNC-LABEL: {{^}}sext_in_reg_i1_i32:
; GCN: s_load_dword [[ARG:s[0-9]+]],
; GCN: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
@@ -659,6 +661,137 @@ define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrs
ret void
}
+; FUNC-LABEL: {{^}}s_sext_in_reg_i1_i16:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x10000
+; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; SI: buffer_store_short [[VBFE]]
+
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
+; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
+define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+ %ld = load i32, i32 addrspace(2)* %ptr
+ %in = trunc i32 %ld to i16
+ %shl = shl i16 %in, 15
+ %sext = ashr i16 %shl, 15
+ store i16 %sext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sext_in_reg_i2_i16:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x20000
+; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; SI: buffer_store_short [[VBFE]]
+
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
+; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
+define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+ %ld = load i32, i32 addrspace(2)* %ptr
+ %in = trunc i32 %ld to i16
+ %shl = shl i16 %in, 14
+ %sext = ashr i16 %shl, 14
+ store i16 %sext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i1_i16:
+; GCN: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]]
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[VAL]], 0, 1{{$}}
+
+; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
+define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep = getelementptr i16, i16 addrspace(1)* %ptr, i32 %tid
+ %out.gep = getelementptr i16, i16 addrspace(3)* %out, i32 %tid
+
+ %in = load i16, i16 addrspace(1)* %gep
+ %shl = shl i16 %in, 15
+ %sext = ashr i16 %shl, 15
+ store i16 %sext, i16 addrspace(3)* %out.gep
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i1_i16_nonload:
+; GCN: {{buffer|flat}}_load_ushort [[VAL0:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort [[VAL1:v[0-9]+]]
+
+; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
+; VI: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
+
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}}
+; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
+define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %b.gep = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+ %out.gep = getelementptr i16, i16 addrspace(3)* %out, i32 %tid
+ %a = load volatile i16, i16 addrspace(1)* %a.gep, align 2
+ %b = load volatile i16, i16 addrspace(1)* %b.gep, align 2
+
+ %c = shl i16 %a, %b
+ %shl = shl i16 %c, 15
+ %ashr = ashr i16 %shl, 15
+
+ store i16 %ashr, i16 addrspace(3)* %out.gep, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sext_in_reg_i2_i16_arg:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x20000
+; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; SI: buffer_store_short [[VBFE]]
+
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
+; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
+define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+ %shl = shl i16 %in, 14
+ %sext = ashr i16 %shl, 14
+ store i16 %sext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sext_in_reg_i8_i16_arg:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; SI: s_sext_i32_i8 [[SSEXT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VSEXT:v[0-9]+]], [[SSEXT]]
+; SI: buffer_store_short [[VSEXT]]
+
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+ %shl = shl i16 %in, 8
+ %sext = ashr i16 %shl, 8
+ store i16 %sext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sext_in_reg_i15_i16_arg:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0xf0000
+; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; SI: buffer_store_short [[VBFE]]
+
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
+; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
+define void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+ %shl = shl i16 %in, 1
+ %sext = ashr i16 %shl, 1
+ store i16 %sext, i16 addrspace(1)* %out
+ ret void
+}
+
declare i32 @llvm.r600.read.tidig.x() #1
attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
new file mode 100644
index 000000000000..1988a14b5845
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
@@ -0,0 +1,597 @@
+# RUN: llc -verify-machineinstrs -march=amdgcn -run-pass si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s
+# Check that add with carry out isn't incorrectly reduced to e32 when
+# the carry out is a virtual register.
+
+# TODO: We should run this test through the end of codegen to make sure
+# that the post-RA run does manage to shrink it, but right now resuming
+# codegen from this point crashes.
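+#
+# For context (an illustrative sketch only, not checked by FileCheck): the
+# VOP3 form writes its carry-out to an arbitrary SGPR pair, e.g.
+#   %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec
+# while the shrunken VOP2 form can only produce the carry in %vcc, e.g.
+#   %29 = V_ADD_I32_e32 %19, %17, implicit-def %vcc, implicit %exec
+# so the shrink is only safe once the carry-out user can read %vcc.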
+
+--- |
+ define void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.ptr
+ %b = load volatile i32, i32 addrspace(1)* %b.ptr
+ %result = add i32 %a, %b
+ store volatile i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+ }
+
+ define void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.ptr
+ %b = load volatile i32, i32 addrspace(1)* %b.ptr
+ %result = sub i32 %a, %b
+ store volatile i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+ }
+
+ define void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.ptr
+ %b = load volatile i32, i32 addrspace(1)* %b.ptr
+ %result = sub i32 %a, %b
+ store volatile i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+ }
+
+ define void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.ptr
+ %b = load volatile i32, i32 addrspace(1)* %b.ptr
+ %result = add i32 %a, %b
+ store volatile i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+ }
+
+ define void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.ptr
+ %b = load volatile i32, i32 addrspace(1)* %b.ptr
+ %result = add i32 %a, %b
+ store volatile i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+ }
+
+ define void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.ptr
+ %b = load volatile i32, i32 addrspace(1)* %b.ptr
+ %result = add i32 %a, %b
+ store volatile i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+ }
+
+ declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+ attributes #0 = { nounwind }
+ attributes #1 = { nounwind readnone }
+
+...
+---
+# GCN-LABEL: name: shrink_add_vop3{{$}}
+# GCN: %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
+name: shrink_add_vop3
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: sreg_32_xm0 }
+ - { id: 22, class: sreg_32 }
+ - { id: 23, class: sreg_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vreg_64 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %27 = REG_SEQUENCE %3, 1, %26, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %28 = V_LSHL_B64 killed %27, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec
+ %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: shrink_sub_vop3{{$}}
+# GCN: %29, %9 = V_SUB_I32_e64 %19, %17, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
+
+name: shrink_sub_vop3
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: sreg_32_xm0 }
+ - { id: 22, class: sreg_32 }
+ - { id: 23, class: sreg_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vreg_64 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %27 = REG_SEQUENCE %3, 1, %26, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %28 = V_LSHL_B64 killed %27, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %29, %9 = V_SUB_I32_e64 %19, %17, implicit %exec
+ %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: shrink_subrev_vop3{{$}}
+# GCN: %29, %9 = V_SUBREV_I32_e64 %19, %17, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
+
+name: shrink_subrev_vop3
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: sreg_32_xm0 }
+ - { id: 22, class: sreg_32 }
+ - { id: 23, class: sreg_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vreg_64 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %27 = REG_SEQUENCE %3, 1, %26, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %28 = V_LSHL_B64 killed %27, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %29, %9 = V_SUBREV_I32_e64 %19, %17, implicit %exec
+ %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: check_addc_src2_vop3{{$}}
+# GCN: %29, %vcc = V_ADDC_U32_e64 %19, %17, %9, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+name: check_addc_src2_vop3
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: sreg_32_xm0 }
+ - { id: 22, class: sreg_32 }
+ - { id: 23, class: sreg_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vreg_64 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %27 = REG_SEQUENCE %3, 1, %26, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %28 = V_LSHL_B64 killed %27, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %9 = S_MOV_B64 0
+ %29, %vcc = V_ADDC_U32_e64 %19, %17, %9, implicit %exec
+ %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: shrink_addc_vop3{{$}}
+# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit %vcc, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+
+name: shrink_addc_vop3
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: sreg_32_xm0 }
+ - { id: 22, class: sreg_32 }
+ - { id: 23, class: sreg_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vreg_64 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %27 = REG_SEQUENCE %3, 1, %26, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %28 = V_LSHL_B64 killed %27, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %vcc = S_MOV_B64 0
+ %29, %vcc = V_ADDC_U32_e64 %19, %17, %vcc, implicit %exec
+ %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ S_ENDPGM
+
+...
+
+---
+# GCN-LABEL: name: shrink_addc_undef_vcc{{$}}
+# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit undef %vcc, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+name: shrink_addc_undef_vcc
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: sreg_32_xm0 }
+ - { id: 22, class: sreg_32 }
+ - { id: 23, class: sreg_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vreg_64 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %27 = REG_SEQUENCE %3, 1, %26, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %28 = V_LSHL_B64 killed %27, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %29, %vcc = V_ADDC_U32_e64 %19, %17, undef %vcc, implicit %exec
+ %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll
index 027c63817903..16aed5928b0a 100644
--- a/test/CodeGen/AMDGPU/v_mac.ll
+++ b/test/CodeGen/AMDGPU/v_mac.ll
@@ -212,5 +212,71 @@ entry:
ret void
}
+; Without special casing the inline constant check for v_mac_f32's
+; src2, this fails to fold the 1.0 into a mad.
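+; (Sketch of why, not checked by FileCheck: v_mac_f32 computes dst = src0 * src1 + dst,
+; so its addend is the tied dst operand and cannot be an inline constant; keeping the
+; 1.0 as an immediate requires the v_mad_f32 form checked below.)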
+
+; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
+; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
+define void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
+bb:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep.a = getelementptr inbounds float, float addrspace(1)* %a, i64 %tid.ext
+ %gep.b = getelementptr inbounds float, float addrspace(1)* %b, i64 %tid.ext
+ %gep.out = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %tmp = load volatile float, float addrspace(1)* %gep.a
+ %tmp1 = load volatile float, float addrspace(1)* %gep.b
+ %tmp2 = fadd float %tmp, %tmp
+ %tmp3 = fmul float %tmp2, 4.0
+ %tmp4 = fsub float 1.0, %tmp3
+ %tmp5 = fadd float %tmp4, %tmp1
+ %tmp6 = fadd float %tmp1, %tmp1
+ %tmp7 = fmul float %tmp6, %tmp
+ %tmp8 = fsub float 1.0, %tmp7
+ %tmp9 = fmul float %tmp8, 8.0
+ %tmp10 = fadd float %tmp5, %tmp9
+ store float %tmp10, float addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f16:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort [[B:v[0-9]+]]
+
+; FIXME: How is this not folded?
+; SI: v_cvt_f32_f16_e32 v{{[0-9]+}}, 0x3c00
+
+; VI: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
+; VI: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
+define void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
+bb:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
+ %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+ %gep.out = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
+ %tmp = load volatile half, half addrspace(1)* %gep.a
+ %tmp1 = load volatile half, half addrspace(1)* %gep.b
+ %tmp2 = fadd half %tmp, %tmp
+ %tmp3 = fmul half %tmp2, 4.0
+ %tmp4 = fsub half 1.0, %tmp3
+ %tmp5 = fadd half %tmp4, %tmp1
+ %tmp6 = fadd half %tmp1, %tmp1
+ %tmp7 = fmul half %tmp6, %tmp
+ %tmp8 = fsub half 1.0, %tmp7
+ %tmp9 = fmul half %tmp8, 8.0
+ %tmp10 = fadd half %tmp5, %tmp9
+ store half %tmp10, half addrspace(1)* %gep.out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
attributes #0 = { nounwind "unsafe-fp-math"="false" }
attributes #1 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
diff --git a/test/CodeGen/ARM/fp16-promote.ll b/test/CodeGen/ARM/fp16-promote.ll
index ebc5934df022..824123687287 100644
--- a/test/CodeGen/ARM/fp16-promote.ll
+++ b/test/CodeGen/ARM/fp16-promote.ll
@@ -825,7 +825,7 @@ define void @test_fmuladd(half* %p, half* %q, half* %r) #0 {
; CHECK-ALL: strh
; CHECK-ALL: mov
; CHECK-ALL-DAG: ldrh
-; CHECK-ALL-DAG: add
+; CHECK-ALL-DAG: orr
; CHECK-ALL: strh
; CHECK-ALL: ldrh
; CHECK-ALL: strh
@@ -855,7 +855,7 @@ define void @test_insertelement(half* %p, <4 x half>* %q, i32 %i) #0 {
; CHECK-VFP: orr
; CHECK-VFP: str
; CHECK-VFP: mov
-; CHECK-VFP: add
+; CHECK-VFP: orr
; CHECK-VFP: ldrh
; CHECK-VFP: strh
; CHECK-VFP: add sp, sp, #8
diff --git a/test/CodeGen/ARM/fpcmp_ueq.ll b/test/CodeGen/ARM/fpcmp_ueq.ll
index ba14140cdc44..c1696c9be1b7 100644
--- a/test/CodeGen/ARM/fpcmp_ueq.ll
+++ b/test/CodeGen/ARM/fpcmp_ueq.ll
@@ -9,7 +9,11 @@ entry:
}
; CHECK-ARMv4-LABEL: f7:
-; CHECK-ARMv4: moveq r6, #1
+; CHECK-ARMv4-DAG: bl ___eqsf2
+; CHECK-ARMv4-DAG: bl ___unordsf2
+; CHECK-ARMv4: cmp r0, #0
+; CHECK-ARMv4: movne r0, #1
+; CHECK-ARMv4: orrs r0, r0,
; CHECK-ARMv4: moveq r0, #42
; CHECK-ARMv7-LABEL: f7:
diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll
index 25c4807d9862..b7693c797635 100644
--- a/test/CodeGen/ARM/vdup.ll
+++ b/test/CodeGen/ARM/vdup.ll
@@ -373,7 +373,8 @@ define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
; CHECK: mov r[[FP:[0-9]+]], sp
; CHECK: ldr r[[IDX:[0-9]+]], [r[[FP]], #4]
; CHECK: mov r[[SPCOPY:[0-9]+]], sp
-; CHECK: vst1.64 {d{{.*}}, d{{.*}}}, [r[[SPCOPY]]:128], r[[IDX]]
+; CHECK: and r[[MASKED_IDX:[0-9]+]], r[[IDX]], #15
+; CHECK: vst1.64 {d{{.*}}, d{{.*}}}, [r[[SPCOPY]]:128], r[[MASKED_IDX]]
; CHECK: vld1.8 {d{{.*}}[]}, [r[[SPCOPY]]]
%x = extractelement <16 x i8> %v, i32 %idx
%1 = insertelement <8 x i8> undef, i8 %x, i32 0
diff --git a/test/CodeGen/ARM/vpadd.ll b/test/CodeGen/ARM/vpadd.ll
index 269223ac9f38..1aa23597cf49 100644
--- a/test/CodeGen/ARM/vpadd.ll
+++ b/test/CodeGen/ARM/vpadd.ll
@@ -214,14 +214,11 @@ define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
}
; Combine vuzp+vadd->vpadd.
-; FIXME: Implement this optimization
-define void @addCombineToVPADD(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
-; CHECK-LABEL: addCombineToVPADD:
+define void @addCombineToVPADD_i8(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADD_i8:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT: vorr d18, d17, d17
-; CHECK-NEXT: vuzp.8 d16, d18
-; CHECK-NEXT: vadd.i8 d16, d18, d16
+; CHECK-NEXT: vpadd.i8 d16, d16, d17
; CHECK-NEXT: vstr d16, [r1]
; CHECK-NEXT: mov pc, lr
%tmp = load <16 x i8>, <16 x i8>* %cbcr
@@ -233,15 +230,44 @@ define void @addCombineToVPADD(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
ret void
}
+; Combine vuzp+vadd->vpadd.
+define void @addCombineToVPADD_i16(<8 x i16> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADD_i16:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpadd.i16 d16, d16, d17
+; CHECK-NEXT: vstr d16, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <8 x i16>, <8 x i16>* %cbcr
+ %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %add = add <4 x i16> %tmp3, %tmp1
+ store <4 x i16> %add, <4 x i16>* %X, align 8
+ ret void
+}
+
+; Combine vtrn+vadd->vpadd.
+define void @addCombineToVPADD_i32(<4 x i32> *%cbcr, <2 x i32> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADD_i32:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpadd.i32 d16, d16, d17
+; CHECK-NEXT: vstr d16, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <4 x i32>, <4 x i32>* %cbcr
+ %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+ %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+ %add = add <2 x i32> %tmp3, %tmp1
+ store <2 x i32> %add, <2 x i32>* %X, align 8
+ ret void
+}
+
; Combine vuzp+vaddl->vpaddl
-; FIXME: Implement this optimization.
-define void @addCombineToVPADDL_sext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
-; CHECK-LABEL: addCombineToVPADDL_sext:
+define void @addCombineToVPADDLq_s8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_s8:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT: vorr d18, d17, d17
-; CHECK-NEXT: vuzp.8 d16, d18
-; CHECK-NEXT: vaddl.s8 q8, d18, d16
+; CHECK-NEXT: vpaddl.s8 q8, q8
; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
; CHECK-NEXT: mov pc, lr
%tmp = load <16 x i8>, <16 x i8>* %cbcr
@@ -254,10 +280,200 @@ define void @addCombineToVPADDL_sext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind s
ret void
}
-; Legalization produces a EXTRACT_VECTOR_ELT DAG node which performs an extend from
-; i16 to i32. In this case the input for the formed VPADDL needs to be a vector of i16s.
-define <2 x i16> @fromExtendingExtractVectorElt(<4 x i16> %in) {
-; CHECK-LABEL: fromExtendingExtractVectorElt:
+; Combine vuzp+vaddl->vpaddl
+; FIXME: Legalization butchers the shuffles.
+define void @addCombineToVPADDL_s8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDL_s8:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vmov.i16 d18, #0x8
+; CHECK-NEXT: vneg.s16 d18, d18
+; CHECK-NEXT: vext.8 d19, d16, d16, #1
+; CHECK-NEXT: vshl.i16 d16, d16, #8
+; CHECK-NEXT: vshl.i16 d17, d19, #8
+; CHECK-NEXT: vshl.s16 d16, d16, d18
+; CHECK-NEXT: vshl.s16 d17, d17, d18
+; CHECK-NEXT: vadd.i16 d16, d17, d16
+; CHECK-NEXT: vstr d16, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <16 x i8>, <16 x i8>* %cbcr
+ %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp4 = sext <4 x i8> %tmp3 to <4 x i16>
+ %tmp5 = sext <4 x i8> %tmp1 to <4 x i16>
+ %add = add <4 x i16> %tmp4, %tmp5
+ store <4 x i16> %add, <4 x i16>* %X, align 8
+ ret void
+}
+
+; Combine vuzp+vaddl->vpaddl
+define void @addCombineToVPADDLq_u8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_u8:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpaddl.u8 q8, q8
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <16 x i8>, <16 x i8>* %cbcr
+ %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ %tmp5 = zext <8 x i8> %tmp1 to <8 x i16>
+ %add = add <8 x i16> %tmp4, %tmp5
+ store <8 x i16> %add, <8 x i16>* %X, align 8
+ ret void
+}
+
+; In theory, it's possible to match this to vpaddl, but rearranging the
+; shuffle is awkward, so this doesn't match at the moment.
+define void @addCombineToVPADDLq_u8_early_zext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_u8_early_zext:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vmovl.u8 q9, d17
+; CHECK-NEXT: vmovl.u8 q8, d16
+; CHECK-NEXT: vuzp.16 q8, q9
+; CHECK-NEXT: vadd.i16 q8, q8, q9
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <16 x i8>, <16 x i8>* %cbcr
+ %tmp1 = zext <16 x i8> %tmp to <16 x i16>
+ %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %add = add <8 x i16> %tmp2, %tmp3
+ store <8 x i16> %add, <8 x i16>* %X, align 8
+ ret void
+}
+
+; Combine vuzp+vaddl->vpaddl
+; FIXME: Legalization butchers the shuffle.
+define void @addCombineToVPADDL_u8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDL_u8:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vext.8 d18, d16, d16, #1
+; CHECK-NEXT: vbic.i16 d16, #0xff00
+; CHECK-NEXT: vbic.i16 d18, #0xff00
+; CHECK-NEXT: vadd.i16 d16, d18, d16
+; CHECK-NEXT: vstr d16, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <16 x i8>, <16 x i8>* %cbcr
+ %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
+ %tmp5 = zext <4 x i8> %tmp1 to <4 x i16>
+ %add = add <4 x i16> %tmp4, %tmp5
+ store <4 x i16> %add, <4 x i16>* %X, align 8
+ ret void
+}
+
+; Matching to vpaddl.8 requires matching shuffle(zext()).
+define void @addCombineToVPADDL_u8_early_zext(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDL_u8_early_zext:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vmovl.u8 q8, d16
+; CHECK-NEXT: vpadd.i16 d16, d16, d17
+; CHECK-NEXT: vstr d16, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <16 x i8>, <16 x i8>* %cbcr
+ %tmp1 = zext <16 x i8> %tmp to <16 x i16>
+ %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %add = add <4 x i16> %tmp2, %tmp3
+ store <4 x i16> %add, <4 x i16>* %X, align 8
+ ret void
+}
+
+; Combine vuzp+vaddl->vpaddl
+define void @addCombineToVPADDLq_s16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_s16:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpaddl.s16 q8, q8
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <8 x i16>, <8 x i16>* %cbcr
+ %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp4 = sext <4 x i16> %tmp3 to <4 x i32>
+ %tmp5 = sext <4 x i16> %tmp1 to <4 x i32>
+ %add = add <4 x i32> %tmp4, %tmp5
+ store <4 x i32> %add, <4 x i32>* %X, align 8
+ ret void
+}
+
+; Combine vuzp+vaddl->vpaddl
+define void @addCombineToVPADDLq_u16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_u16:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpaddl.u16 q8, q8
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <8 x i16>, <8 x i16>* %cbcr
+ %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+ %tmp5 = zext <4 x i16> %tmp1 to <4 x i32>
+ %add = add <4 x i32> %tmp4, %tmp5
+ store <4 x i32> %add, <4 x i32>* %X, align 8
+ ret void
+}
+
+; Combine vtrn+vaddl->vpaddl
+define void @addCombineToVPADDLq_s32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_s32:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpaddl.s32 q8, q8
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <4 x i32>, <4 x i32>* %cbcr
+ %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+ %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+ %tmp4 = sext <2 x i32> %tmp3 to <2 x i64>
+ %tmp5 = sext <2 x i32> %tmp1 to <2 x i64>
+ %add = add <2 x i64> %tmp4, %tmp5
+ store <2 x i64> %add, <2 x i64>* %X, align 8
+ ret void
+}
+
+; Combine vtrn+vaddl->vpaddl
+define void @addCombineToVPADDLq_u32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_u32:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpaddl.u32 q8, q8
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <4 x i32>, <4 x i32>* %cbcr
+ %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+ %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+ %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+ %tmp5 = zext <2 x i32> %tmp1 to <2 x i64>
+ %add = add <2 x i64> %tmp4, %tmp5
+ store <2 x i64> %add, <2 x i64>* %X, align 8
+ ret void
+}
+
+; Legalization promotes the <4 x i8> to <4 x i16>.
+define <4 x i8> @fromExtendingExtractVectorElt_i8(<8 x i8> %in) {
+; CHECK-LABEL: fromExtendingExtractVectorElt_i8:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vpaddl.s8 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %x = add <4 x i8> %tmp2, %tmp1
+ ret <4 x i8> %x
+}
+
+; Legalization promotes the <2 x i16> to <2 x i32>.
+define <2 x i16> @fromExtendingExtractVectorElt_i16(<4 x i16> %in) {
+; CHECK-LABEL: fromExtendingExtractVectorElt_i16:
; CHECK: @ BB#0:
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vpaddl.s16 d16, d16
diff --git a/test/CodeGen/ARM/vtrn.ll b/test/CodeGen/ARM/vtrn.ll
index 36731e933bab..df6336043fdf 100644
--- a/test/CodeGen/ARM/vtrn.ll
+++ b/test/CodeGen/ARM/vtrn.ll
@@ -70,14 +70,14 @@ define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vtrn.32 d17, d16
-; CHECK-NEXT: vadd.i32 d16, d17, d16
+; CHECK-NEXT: vmul.i32 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = load <2 x i32>, <2 x i32>* %B
%tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 0, i32 2>
%tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 3>
- %tmp5 = add <2 x i32> %tmp3, %tmp4
+ %tmp5 = mul <2 x i32> %tmp3, %tmp4
ret <2 x i32> %tmp5
}
diff --git a/test/CodeGen/Mips/llvm-ir/extractelement.ll b/test/CodeGen/Mips/llvm-ir/extractelement.ll
index 1e1b02df99a2..3c7df4a5e99f 100644
--- a/test/CodeGen/Mips/llvm-ir/extractelement.ll
+++ b/test/CodeGen/Mips/llvm-ir/extractelement.ll
@@ -14,6 +14,7 @@ define i1 @via_stack_bug(i8 signext %idx) {
; ALL-DAG: addiu [[ONE:\$[0-9]+]], $zero, 1
; ALL-DAG: sb [[ONE]], 7($sp)
; ALL-DAG: sb $zero, 6($sp)
+; ALL-DAG: andi [[MASKED_IDX:\$[0-9]+]], $4, 1
; ALL-DAG: addiu [[VPTR:\$[0-9]+]], $sp, 6
-; ALL-DAG: addu [[EPTR:\$[0-9]+]], $4, [[VPTR]]
+; ALL-DAG: or [[EPTR:\$[0-9]+]], [[MASKED_IDX]], [[VPTR]]
; ALL: lbu $2, 0([[EPTR]])
diff --git a/test/CodeGen/Mips/msa/immediates-bad.ll b/test/CodeGen/Mips/msa/immediates-bad.ll
new file mode 100644
index 000000000000..c6b8fcef649a
--- /dev/null
+++ b/test/CodeGen/Mips/msa/immediates-bad.ll
@@ -0,0 +1,1681 @@
+; RUN: not llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s 2> %t1
+; RUN: FileCheck %s < %t1
+
+; Test that the immediate intrinsics with out-of-range values trigger an error.
+
+
+define void @binsli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %a, <16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+; CHECK: LLVM ERROR: Immediate out of range
+
+define void @binsri_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.binsri.b(<16 x i8> %a, <16 x i8> %a, i32 5)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bmnzi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %a, <16 x i8> %a, i32 63)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bmzi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %a, <16 x i8> %a, i32 63)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bnegi.b(<16 x i8> %a, i32 6)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bseli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bseli.b(<16 x i8> %a, <16 x i8> %a, i32 63)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bseti.b(<16 x i8> %a, i32 9)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clei.s.b(<16 x i8> %a, i32 152)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clei.u.b(<16 x i8> %a, i32 163)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clti.s.b(<16 x i8> %a, i32 129)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clti.u.b(<16 x i8> %a, i32 163)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_b(<16 x i8> * %ptr) {
+entry:
+ %r = call <16 x i8> @llvm.mips.ldi.b(i32 1025)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.maxi.s.b(<16 x i8> %a, i32 163)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.maxi.u.b(<16 x i8> %a, i32 163)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.mini.s.b(<16 x i8> %a, i32 163)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.mini.u.b(<16 x i8> %a, i32 163)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @nori_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.nori.b(<16 x i8> %a, i32 63)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @ori_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.ori.b(<16 x i8> %a, i32 63)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.sldi.b(<16 x i8> %a, <16 x i8> %a, i32 7)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @slli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.slli.b(<16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @splati_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.splati.b(<16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srai_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srai.b(<16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srari_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srari.b(<16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srli.b(<16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srlri.b(<16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @addvi_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.addvi.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @bclri_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bclri.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.binsli.w(<4 x i32> %a, <4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.binsri.w(<4 x i32> %a, <4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bnegi.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bseti.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clei.s.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clei.u.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clti.s.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clti.u.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.maxi.s.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.maxi.u.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.mini.s.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.mini.u.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_w(<4 x i32> * %ptr) {
+entry:
+ %r = call <4 x i32> @llvm.mips.ldi.w(i32 1024)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.sldi.w(<4 x i32> %a, <4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @slli_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.slli.w(<4 x i32> %a, i32 65)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @splati_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.splati.w(<4 x i32> %a, i32 65)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srai_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srai.w(<4 x i32> %a, i32 65)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srari_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srari.w(<4 x i32> %a, i32 65)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srli_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srli.w(<4 x i32> %a, i32 65)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srlri.w(<4 x i32> %a, i32 65)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @addvi_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.addvi.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @bclri_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bclri.h(<8 x i16> %a, i32 16)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.binsli.h(<8 x i16> %a, <8 x i16> %a, i32 17)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.binsri.h(<8 x i16> %a, <8 x i16> %a, i32 19)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bnegi.h(<8 x i16> %a, i32 19)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bseti.h(<8 x i16> %a, i32 19)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clei.s.h(<8 x i16> %a, i32 63)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clei.u.h(<8 x i16> %a, i32 130)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clti.s.h(<8 x i16> %a, i32 63)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clti.u.h(<8 x i16> %a, i32 63)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.maxi.s.h(<8 x i16> %a, i32 63)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.maxi.u.h(<8 x i16> %a, i32 130)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.mini.s.h(<8 x i16> %a, i32 63)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.mini.u.h(<8 x i16> %a, i32 130)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_h(<8 x i16> * %ptr) {
+entry:
+ %r = call <8 x i16> @llvm.mips.ldi.h(i32 1024)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.sldi.h(<8 x i16> %a, <8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @slli_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.slli.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @splati_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.splati.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srai_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srai.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srari_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srari.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srli_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srli.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srlri.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define i32 @copy_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.b(<16 x i8> %a, i32 17)
+ ret i32 %r
+}
+
+
+define i32 @copy_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.h(<8 x i16> %a, i32 9)
+ ret i32 %r
+}
+
+
+define i32 @copy_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.w(<4 x i32> %a, i32 5)
+ ret i32 %r
+}
+
+
+define i32 @copy_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.b(<16 x i8> %a, i32 16)
+ ret i32 %r
+}
+
+
+define i32 @copy_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.h(<8 x i16> %a, i32 9)
+ ret i32 %r
+}
+
+
+define i32 @copy_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.w(<4 x i32> %a, i32 5)
+ ret i32 %r
+}
+
+define i64 @copy_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call i64 @llvm.mips.copy.s.d(<2 x i64> %a, i32 3)
+ ret i64 %r
+}
+
+define i64 @copy_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call i64 @llvm.mips.copy.u.d(<2 x i64> %a, i32 3)
+ ret i64 %r
+}
+
+define void @addvi_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.addvi.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @bclri_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bclri.d(<2 x i64> %a, i32 64)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.binsli.d(<2 x i64> %a, <2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.binsri.d(<2 x i64> %a, <2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bnegi.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bseti.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clei.s.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clei.u.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clti.s.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clti.u.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_d(<2 x i64> * %ptr) {
+entry:
+ %r = call <2 x i64> @llvm.mips.ldi.d(i32 1024)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.maxi.s.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.maxi.u.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.mini.s.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.mini.u.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.sldi.d(<2 x i64> %a, <2 x i64> %a, i32 1)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @slli_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.slli.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srai_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srai.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srari_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srari.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srli_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srli.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srlri.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+; Negative numbers
+
+
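+; The neg_* functions below mirror the cases above, but pass negative values
+; for the immediate operands.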
+define void @neg_addvi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.addvi.b(<16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_andi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.andi.b(<16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bclri_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bclri.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %a, <16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsri_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.binsri.b(<16 x i8> %a, <16 x i8> %a, i32 5)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bmnzi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %a, <16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bmzi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %a, <16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bnegi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bnegi.b(<16 x i8> %a, i32 6)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bseli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bseli.b(<16 x i8> %a, <16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bseti_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bseti.b(<16 x i8> %a, i32 -5)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clei.s.b(<16 x i8> %a, i32 -120)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clei.u.b(<16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clti.s.b(<16 x i8> %a, i32 -35)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clti.u.b(<16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_ldi_b(<16 x i8> * %ptr) {
+entry:
+ %r = call <16 x i8> @llvm.mips.ldi.b(i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.maxi.s.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.maxi.u.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.mini.s.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.mini.u.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_nori_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.nori.b(<16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_ori_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.ori.b(<16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_sldi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.sldi.b(<16 x i8> %a, <16 x i8> %a, i32 -7)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_slli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.slli.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_splati_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.splati.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srai_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srai.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srari_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srari.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srli.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srlri_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srlri.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_addvi_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.addvi.w(<4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bclri_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bclri.w(<4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsli_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.binsli.w(<4 x i32> %a, <4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsri_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.binsri.w(<4 x i32> %a, <4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bnegi_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bnegi.w(<4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bseti_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bseti.w(<4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clei.s.w(<4 x i32> %a, i32 -140)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clei.u.w(<4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clti.s.w(<4 x i32> %a, i32 -150)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clti.u.w(<4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.maxi.s.w(<4 x i32> %a, i32 -200)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.maxi.u.w(<4 x i32> %a, i32 -200)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.mini.s.w(<4 x i32> %a, i32 -200)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.mini.u.w(<4 x i32> %a, i32 -200)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_ldi_w(<4 x i32> * %ptr) {
+entry:
+ %r = call <4 x i32> @llvm.mips.ldi.w(i32 -300)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_sldi_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.sldi.w(<4 x i32> %a, <4 x i32> %a, i32 -20)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_slli_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.slli.w(<4 x i32> %a, i32 -3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_splati_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.splati.w(<4 x i32> %a, i32 -3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srai_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srai.w(<4 x i32> %a, i32 -3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srari_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srari.w(<4 x i32> %a, i32 -3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srli_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srli.w(<4 x i32> %a, i32 -3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srlri_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srlri.w(<4 x i32> %a, i32 -3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_addvi_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.addvi.h(<8 x i16> %a, i32 -25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bclri_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bclri.h(<8 x i16> %a, i32 -8)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsli_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.binsli.h(<8 x i16> %a, <8 x i16> %a, i32 -8)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsri_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.binsri.h(<8 x i16> %a, <8 x i16> %a, i32 -15)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bnegi_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bnegi.h(<8 x i16> %a, i32 -14)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bseti_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bseti.h(<8 x i16> %a, i32 -15)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clei.s.h(<8 x i16> %a, i32 -25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clei.u.h(<8 x i16> %a, i32 -25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clti.s.h(<8 x i16> %a, i32 -150)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clti.u.h(<8 x i16> %a, i32 -25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.maxi.s.h(<8 x i16> %a, i32 -200)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.maxi.u.h(<8 x i16> %a, i32 -200)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.mini.s.h(<8 x i16> %a, i32 -200)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.mini.u.h(<8 x i16> %a, i32 -2)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_ldi_h(<8 x i16> * %ptr) {
+entry:
+ %r = call <8 x i16> @llvm.mips.ldi.h(i32 -300)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_sldi_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.sldi.h(<8 x i16> %a, <8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_slli_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.slli.h(<8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_splati_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.splati.h(<8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srai_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srai.h(<8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srari_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srari.h(<8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srli_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srli.h(<8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srlri_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srlri.h(<8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define i32 @neg_copy_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.b(<16 x i8> %a, i32 -1)
+ ret i32 %r
+}
+
+define i32 @neg_copy_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.h(<8 x i16> %a, i32 -1)
+ ret i32 %r
+}
+
+define i32 @neg_copy_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.w(<4 x i32> %a, i32 -1)
+ ret i32 %r
+}
+
+define i32 @neg_copy_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.b(<16 x i8> %a, i32 -1)
+ ret i32 %r
+}
+
+
+define i32 @neg_copy_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.h(<8 x i16> %a, i32 -1)
+ ret i32 %r
+}
+
+
+define i32 @neg_copy_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.w(<4 x i32> %a, i32 -1)
+ ret i32 %r
+}
+
+define i64 @neg_copy_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call i64 @llvm.mips.copy.s.d(<2 x i64> %a, i32 -1)
+ ret i64 %r
+}
+
+define i64 @neg_copy_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call i64 @llvm.mips.copy.u.d(<2 x i64> %a, i32 -1)
+ ret i64 %r
+}
+
+define void @neg_addvi_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.addvi.d(<2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bclri_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bclri.d(<2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsli_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.binsli.d(<2 x i64> %a, <2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsri_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.binsri.d(<2 x i64> %a, <2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bnegi_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bnegi.d(<2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bseti_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bseti.d(<2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clei.s.d(<2 x i64> %a, i32 -45)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clei.u.d(<2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clti.s.d(<2 x i64> %a, i32 -32)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clti.u.d(<2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_ldi_d(<2 x i64> * %ptr) {
+entry:
+ %r = call <2 x i64> @llvm.mips.ldi.d(i32 -3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.maxi.s.d(<2 x i64> %a, i32 -202)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.maxi.u.d(<2 x i64> %a, i32 -2)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.mini.s.d(<2 x i64> %a, i32 -202)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.mini.u.d(<2 x i64> %a, i32 -2)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_sldi_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.sldi.d(<2 x i64> %a, <2 x i64> %a, i32 -1)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_slli_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.slli.d(<2 x i64> %a, i32 -3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srai_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srai.d(<2 x i64> %a, i32 -3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srari_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srari.d(<2 x i64> %a, i32 -3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srli_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srli.d(<2 x i64> %a, i32 -3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srlri_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srlri.d(<2 x i64> %a, i32 -3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.ldi.h(i32)
+declare <8 x i16> @llvm.mips.addvi.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bclri.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.binsli.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.binsri.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bnegi.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bseti.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clei.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clei.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clti.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clti.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.maxi.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.maxi.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.mini.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.mini.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.sldi.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.slli.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.splati.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srai.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srari.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srli.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srlri.h(<8 x i16>, i32)
+declare <4 x i32> @llvm.mips.addvi.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bclri.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.binsli.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.binsri.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bnegi.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bseti.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.ldi.w(i32)
+declare <4 x i32> @llvm.mips.clei.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clei.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clti.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clti.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.maxi.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.maxi.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.mini.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.mini.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.sldi.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.slli.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.splati.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srai.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srari.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srli.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srlri.w(<4 x i32>, i32)
+declare <2 x i64> @llvm.mips.ldi.d(i32)
+declare <2 x i64> @llvm.mips.addvi.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bclri.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.binsli.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.binsri.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bnegi.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bseti.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clei.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clei.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clti.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clti.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.maxi.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.maxi.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.mini.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.mini.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.sldi.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.slli.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.splati.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srai.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srari.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srli.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srlri.d(<2 x i64>, i32)
+declare <16 x i8> @llvm.mips.ldi.b(i32)
+declare <16 x i8> @llvm.mips.addvi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.andi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bclri.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.binsli.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.binsri.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bmnzi.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bnegi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bseli.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bseti.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clei.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clei.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clti.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clti.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.maxi.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.maxi.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.mini.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.mini.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.nori.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.ori.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.sldi.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.slli.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.splati.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srai.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srari.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srli.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srlri.b(<16 x i8>, i32)
+declare i32 @llvm.mips.copy.s.h(<8 x i16>, i32)
+declare i32 @llvm.mips.copy.u.h(<8 x i16>, i32)
+declare i32 @llvm.mips.copy.s.w(<4 x i32>, i32)
+declare i32 @llvm.mips.copy.u.w(<4 x i32>, i32)
+declare i64 @llvm.mips.copy.s.d(<2 x i64>, i32)
+declare i64 @llvm.mips.copy.u.d(<2 x i64>, i32)
+declare i32 @llvm.mips.copy.s.b(<16 x i8>, i32)
+declare i32 @llvm.mips.copy.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bmzi.b(<16 x i8>, <16 x i8>, i32)
diff --git a/test/CodeGen/Mips/msa/immediates.ll b/test/CodeGen/Mips/msa/immediates.ll
new file mode 100644
index 000000000000..b561ace30a8a
--- /dev/null
+++ b/test/CodeGen/Mips/msa/immediates.ll
@@ -0,0 +1,1276 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s -check-prefixes=CHECK,MSA32
+; RUN: llc -march=mips64 -mattr=+msa,+fp64 -relocation-model=pic -target-abi n32 < %s \
+; RUN: | FileCheck %s -check-prefixes=CHECK,MSA64,MSA64N32
+; RUN: llc -march=mips64 -mattr=+msa,+fp64 -relocation-model=pic -target-abi n64 < %s \
+; RUN: | FileCheck %s -check-prefixes=CHECK,MSA64,MSA64N64
+
+; Test that the immediate intrinsics don't crash LLVM.
+
+; Some of the intrinsics lower to equivalent forms.
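+; For example, bclri.b with a constant bit index is selected as andi.b with the
+; complementary mask, and bmzi.b is selected as bmnzi.b, so the bclri_b and
+; bmzi_b tests below check for those equivalent instructions.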
+
+define void @addvi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: addvi_b:
+; CHECK: addvi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.addvi.b(<16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @andi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: andi_b:
+; CHECK: andi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.andi.b(<16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bclri_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: bclri_b:
+; CHECK: andi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bclri.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_b(<16 x i8> * %ptr, <16 x i8> * %ptr2) {
+entry:
+; CHECK-LABEL: binsli_b:
+; CHECK: binsli.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %b = load <16 x i8>, <16 x i8> * %ptr2, align 16
+ %r = call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %a, <16 x i8> %b, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_b(<16 x i8> * %ptr, <16 x i8> * %ptr2) {
+entry:
+; CHECK-LABEL: binsri_b:
+; CHECK: binsri.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %b = load <16 x i8>, <16 x i8> * %ptr2, align 16
+ %r = call <16 x i8> @llvm.mips.binsri.b(<16 x i8> %a, <16 x i8> %b, i32 5)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bmnzi_b(<16 x i8> * %ptr, <16 x i8> * %ptr2) {
+entry:
+; CHECK-LABEL: bmnzi_b:
+; CHECK: bmnzi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %b = load <16 x i8>, <16 x i8> * %ptr2, align 16
+ %r = call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %a, <16 x i8> %b, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bmzi_b(<16 x i8> * %ptr, <16 x i8> * %ptr2) {
+entry:
+; CHECK-LABEL: bmzi_b:
+; CHECK: bmnzi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %b = load <16 x i8>, <16 x i8> * %ptr2, align 16
+ %r = call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %a, <16 x i8> %b, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: bnegi_b:
+; CHECK: bnegi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bnegi.b(<16 x i8> %a, i32 6)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bseli_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: bseli_b:
+; CHECK: bseli.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bseli.b(<16 x i8> %a, <16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: bseti_b:
+; CHECK: bseti.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bseti.b(<16 x i8> %a, i32 5)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: clei_s_b:
+; CHECK: clei_s.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clei.s.b(<16 x i8> %a, i32 12)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: clei_u_b:
+; CHECK: clei_u.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clei.u.b(<16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: clti_s_b:
+; CHECK: clti_s.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clti.s.b(<16 x i8> %a, i32 15)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: clti_u_b:
+; CHECK: clti_u.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clti.u.b(<16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: ldi_b:
+; CHECK: ldi.b
+ %r = call <16 x i8> @llvm.mips.ldi.b(i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_s_b:
+; CHECK: maxi_s.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.maxi.s.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_u_b:
+; CHECK: maxi_u.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.maxi.u.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: mini_s_b:
+; CHECK: mini_s.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.mini.s.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: mini_u_b:
+; CHECK: mini_u.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.mini.u.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @nori_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: nori_b:
+; CHECK: nori.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.nori.b(<16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @ori_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: ori_b:
+; CHECK: ori.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.ori.b(<16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: sldi_b:
+; CHECK: sldi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.sldi.b(<16 x i8> %a, <16 x i8> %a, i32 7)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @slli_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: slli_b:
+; CHECK: slli.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.slli.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @splati_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: splati_b:
+; CHECK: splati.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.splati.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srai_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: srai_b:
+; CHECK: srai.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srai.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srari_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: srari_b:
+; CHECK: srari.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srari.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srli_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: srli_b:
+; CHECK: srli.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srli.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: srlri_b:
+; CHECK: srlri.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srlri.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @ld_b(<16 x i8> * %ptr, i8 * %ldptr, i32 %offset) {
+entry:
+; CHECK-LABEL: ld_b
+; MSA32: addu $[[R0:[0-9]]], $5, $6
+
+; MSA64N32-DAG: sll $[[R2:[0-9]]], $6, 0
+; MSA64N32-DAG: sll $[[R1:[0-9]]], $5, 0
+; MSA64N32: addu $[[R0:[0-9]]], $[[R1]], $[[R2]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $5, $[[R1]]
+
+; CHECK: ld.b $w{{[0-9]+}}, 0($[[R0]])
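+; The offset is not a constant, so it is added to the base pointer first (addu
+; for MSA32/N32, daddu for N64; the 64-bit ABIs sign-extend the 32-bit operands
+; with sll ..., 0) and ld.b then uses a zero displacement.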
+ %a = call <16 x i8> @llvm.mips.ld.b(i8* %ldptr, i32 %offset)
+ store <16 x i8> %a, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @st_b(<16 x i8> * %ptr, i8 * %ldptr, i32 %offset, i8 * %stptr) {
+entry:
+; CHECK-LABEL: st_b
+; MSA32: addu $[[R0:[0-9]]], $7, $6
+
+; MSA64N32: sll $[[R1:[0-9]]], $6, 0
+; MSA64N32: sll $[[R2:[0-9]]], $7, 0
+; MSA64N32: addu $[[R0:[0-9]]], $[[R2]], $[[R1]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $7, $[[R1]]
+; CHECK: st.b $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <16 x i8> @llvm.mips.ld.b(i8* %ldptr, i32 0)
+ call void @llvm.mips.st.b(<16 x i8> %a, i8* %stptr, i32 %offset)
+ ret void
+}
+
+define void @addvi_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: addvi_w:
+; CHECK: addvi.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.addvi.w(<4 x i32> %a, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @bclri_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: bclri_w:
+; CHECK: bclri.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bclri.w(<4 x i32> %a, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_w(<4 x i32> * %ptr, <4 x i32> * %ptr2) {
+entry:
+; CHECK-LABEL: binsli_w:
+; CHECK: binsli.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %b = load <4 x i32>, <4 x i32> * %ptr2, align 16
+ %r = call <4 x i32> @llvm.mips.binsli.w(<4 x i32> %a, <4 x i32> %b, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_w(<4 x i32> * %ptr, <4 x i32> * %ptr2) {
+entry:
+; CHECK-LABEL: binsri_w:
+; CHECK: binsri.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %b = load <4 x i32>, <4 x i32> * %ptr2, align 16
+ %r = call <4 x i32> @llvm.mips.binsri.w(<4 x i32> %a, <4 x i32> %b, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: bnegi_w:
+; CHECK: bnegi.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bnegi.w(<4 x i32> %a, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: bseti_w:
+; CHECK: bseti.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bseti.w(<4 x i32> %a, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: clei_s_w:
+; CHECK: clei_s.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clei.s.w(<4 x i32> %a, i32 14)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: clei_u_w:
+; CHECK: clei_u.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clei.u.w(<4 x i32> %a, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: clti_s_w:
+; CHECK: clti_s.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clti.s.w(<4 x i32> %a, i32 15)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: clti_u_w:
+; CHECK: clti_u.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clti.u.w(<4 x i32> %a, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_s_w:
+; CHECK: maxi_s.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.maxi.s.w(<4 x i32> %a, i32 2)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_u_w:
+; CHECK: maxi_u.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.maxi.u.w(<4 x i32> %a, i32 2)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: mini_s_w:
+; CHECK: mini_s.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.mini.s.w(<4 x i32> %a, i32 2)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: mini_u_w:
+; CHECK: mini_u.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.mini.u.w(<4 x i32> %a, i32 2)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: ldi_w:
+; CHECK: ldi.w
+ %r = call <4 x i32> @llvm.mips.ldi.w(i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: sldi_w:
+; CHECK: sldi.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.sldi.w(<4 x i32> %a, <4 x i32> %a, i32 2)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @slli_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: slli_w:
+; CHECK: slli.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.slli.w(<4 x i32> %a, i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @splati_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: splati_w:
+; CHECK: splati.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.splati.w(<4 x i32> %a, i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srai_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: srai_w:
+; CHECK: srai.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srai.w(<4 x i32> %a, i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srari_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: srari_w:
+; CHECK: srari.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srari.w(<4 x i32> %a, i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srli_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: srli_w:
+; CHECK: srli.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srli.w(<4 x i32> %a, i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: srlri_w:
+; CHECK: srlri.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srlri.w(<4 x i32> %a, i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @ld_w(<4 x i32> * %ptr, i8 * %ldptr, i32 %offset) {
+entry:
+; CHECK-LABEL: ld_w
+; MSA32: addu $[[R0:[0-9]]], $5, $6
+; MSA64N32: sll $[[R2:[0-9]]], $6, 0
+; MSA64N32: sll $[[R1:[0-9]]], $5, 0
+; MSA64N32: addu $[[R0:[0-9]]], $[[R1]], $[[R2]]
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $5, $[[R1]]
+; CHECK: ld.w $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <4 x i32> @llvm.mips.ld.w(i8* %ldptr, i32 %offset)
+ store <4 x i32> %a, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @st_w(<8 x i16> * %ptr, i8 * %ldptr, i32 %offset, i8 * %stptr) {
+entry:
+; CHECK-LABEL: st_w
+; MSA32: addu $[[R0:[0-9]]], $7, $6
+
+; MSA64N32: sll $[[R1:[0-9]+]], $6, 0
+; MSA64N32: sll $[[R2:[0-9]+]], $7, 0
+; MSA64N32: addu $[[R0:[0-9]+]], $[[R2]], $[[R1]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $7, $[[R1]]
+; CHECK: st.w $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <4 x i32> @llvm.mips.ld.w(i8* %ldptr, i32 0)
+ call void @llvm.mips.st.w(<4 x i32> %a, i8* %stptr, i32 %offset)
+ ret void
+}
+
+define void @addvi_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: addvi_h:
+; CHECK: addvi.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.addvi.h(<8 x i16> %a, i32 25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @bclri_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: bclri_h:
+; CHECK: bclri.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bclri.h(<8 x i16> %a, i32 8)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_h(<8 x i16> * %ptr, <8 x i16> * %ptr2) {
+entry:
+; CHECK-LABEL: binsli_h:
+; CHECK: binsli.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %b = load <8 x i16>, <8 x i16> * %ptr2, align 16
+ %r = call <8 x i16> @llvm.mips.binsli.h(<8 x i16> %a, <8 x i16> %b, i32 8)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_h(<8 x i16> * %ptr, <8 x i16> * %ptr2) {
+entry:
+; CHECK-LABEL: binsri_h:
+; CHECK: binsri.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %b = load <8 x i16>, <8 x i16> * %ptr2, align 16
+ %r = call <8 x i16> @llvm.mips.binsri.h(<8 x i16> %a, <8 x i16> %b, i32 15)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: bnegi_h:
+; CHECK: bnegi.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bnegi.h(<8 x i16> %a, i32 14)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: bseti_h:
+; CHECK: bseti.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bseti.h(<8 x i16> %a, i32 15)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: clei_s_h:
+; CHECK: clei_s.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clei.s.h(<8 x i16> %a, i32 13)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: clei_u_h:
+; CHECK: clei_u.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clei.u.h(<8 x i16> %a, i32 25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: clti_s_h:
+; CHECK: clti_s.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clti.s.h(<8 x i16> %a, i32 15)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: clti_u_h:
+; CHECK: clti_u.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clti.u.h(<8 x i16> %a, i32 25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_s_h:
+; CHECK: maxi_s.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.maxi.s.h(<8 x i16> %a, i32 2)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_u_h:
+; CHECK: maxi_u.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.maxi.u.h(<8 x i16> %a, i32 2)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: mini_s_h:
+; CHECK: mini_s.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.mini.s.h(<8 x i16> %a, i32 2)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: mini_u_h:
+; CHECK: mini_u.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.mini.u.h(<8 x i16> %a, i32 2)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: ldi_h:
+; CHECK: ldi.h
+ %r = call <8 x i16> @llvm.mips.ldi.h(i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: sldi_h:
+; CHECK: sldi.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.sldi.h(<8 x i16> %a, <8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @slli_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: slli_h:
+; CHECK: slli.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.slli.h(<8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @splati_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: splati_h:
+; CHECK: splati.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.splati.h(<8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srai_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: srai_h:
+; CHECK: srai.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srai.h(<8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srari_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: srari_h:
+; CHECK: srari.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srari.h(<8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srli_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: srli_h:
+; CHECK: srli.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srli.h(<8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: srlri_h:
+; CHECK: srlri.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srlri.h(<8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @ld_h(<8 x i16> * %ptr, i8 * %ldptr, i32 %offset) {
+entry:
+; CHECK-LABEL: ld_h
+; MSA32: addu $[[R0:[0-9]]], $5, $6
+
+; MSA64N32-DAG: sll $[[R2:[0-9]]], $6, 0
+; MSA64N32-DAG: sll $[[R1:[0-9]]], $5, 0
+; MSA64N32: addu $[[R0:[0-9]]], $[[R1]], $[[R2]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $5, $[[R1]]
+
+; CHECK: ld.h $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <8 x i16> @llvm.mips.ld.h(i8* %ldptr, i32 %offset)
+ store <8 x i16> %a, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @st_h(<8 x i16> * %ptr, i8 * %ldptr, i32 %offset, i8 * %stptr) {
+entry:
+; CHECK-LABEL: st_h
+; MSA32: addu $[[R0:[0-9]]], $7, $6
+
+; MSA64N32-DAG: sll $[[R1:[0-9]+]], $6, 0
+; MSA64N32-DAG: sll $[[R2:[0-9]+]], $7, 0
+; MSA64N32: addu $[[R0:[0-9]+]], $[[R2]], $[[R1]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $7, $[[R1]]
+; CHECK: st.h $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <8 x i16> @llvm.mips.ld.h(i8* %ldptr, i32 0)
+ call void @llvm.mips.st.h(<8 x i16> %a, i8* %stptr, i32 %offset)
+ ret void
+}
+
+define i32 @copy_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: copy_s_b:
+; CHECK: copy_s.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.b(<16 x i8> %a, i32 1)
+ ret i32 %r
+}
+define i32 @copy_s_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: copy_s_h:
+; CHECK: copy_s.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.h(<8 x i16> %a, i32 1)
+ ret i32 %r
+}
+define i32 @copy_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: copy_s_w:
+; CHECK: copy_s.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.w(<4 x i32> %a, i32 1)
+ ret i32 %r
+}
+define i32 @copy_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: copy_u_b:
+; CHECK: copy_u.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.b(<16 x i8> %a, i32 1)
+ ret i32 %r
+}
+define i32 @copy_u_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: copy_u_h:
+; CHECK: copy_u.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.h(<8 x i16> %a, i32 1)
+ ret i32 %r
+}
+define i32 @copy_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: copy_u_w:
+; MSA32: copy_s.w
+; MSA64: copy_u.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.w(<4 x i32> %a, i32 1)
+ ret i32 %r
+}
+
+define i64 @copy_s_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: copy_s_d:
+; MSA32: copy_s.w
+; MSA32: copy_s.w
+; MSA64: copy_s.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call i64 @llvm.mips.copy.s.d(<2 x i64> %a, i32 1)
+ ret i64 %r
+}
+
+define i64 @copy_u_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: copy_u_d:
+; MSA32: copy_s.w
+; MSA32: copy_s.w
+; MSA64: copy_s.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call i64 @llvm.mips.copy.u.d(<2 x i64> %a, i32 1)
+ ret i64 %r
+}
+
+define void @addvi_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: addvi_d:
+; CHECK: addvi.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.addvi.d(<2 x i64> %a, i32 25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @bclri_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: bclri_d:
+; CHECK: and.v
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bclri.d(<2 x i64> %a, i32 16)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_d(<2 x i64> * %ptr, <2 x i64> * %ptr2) {
+entry:
+; CHECK-LABEL: binsli_d:
+; CHECK: bsel.v
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %b = load <2 x i64>, <2 x i64> * %ptr2, align 16
+ %r = call <2 x i64> @llvm.mips.binsli.d(<2 x i64> %a, <2 x i64> %b, i32 4)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_d(<2 x i64> * %ptr, <2 x i64> * %ptr2) {
+entry:
+; CHECK-LABEL: binsri_d:
+; CHECK: binsri.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %b = load <2 x i64>, <2 x i64> * %ptr2, align 16
+ %r = call <2 x i64> @llvm.mips.binsri.d(<2 x i64> %a, <2 x i64> %b, i32 5)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: bnegi_d:
+; CHECK: xor.v
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bnegi.d(<2 x i64> %a, i32 9)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: bseti_d:
+; CHECK: or.v
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bseti.d(<2 x i64> %a, i32 25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: clei_s_d:
+; CHECK: clei_s.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clei.s.d(<2 x i64> %a, i32 15)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: clei_u_d:
+; CHECK: clei_u.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clei.u.d(<2 x i64> %a, i32 25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: clti_s_d:
+; CHECK: clti_s.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clti.s.d(<2 x i64> %a, i32 15)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: clti_u_d:
+; CHECK: clti_u.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clti.u.d(<2 x i64> %a, i32 25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: ldi_d:
+; CHECK: ldi.d
+ %r = call <2 x i64> @llvm.mips.ldi.d(i32 3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_s_d:
+; CHECK: maxi_s.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.maxi.s.d(<2 x i64> %a, i32 2)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_u_d:
+; CHECK: maxi_u.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.maxi.u.d(<2 x i64> %a, i32 2)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: mini_s_d:
+; CHECK: mini_s.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.mini.s.d(<2 x i64> %a, i32 2)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: mini_u_d:
+; CHECK: mini_u.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.mini.u.d(<2 x i64> %a, i32 2)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: sldi_d:
+; CHECK: sldi.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.sldi.d(<2 x i64> %a, <2 x i64> %a, i32 1)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @slli_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: slli_d:
+; CHECK: slli.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.slli.d(<2 x i64> %a, i32 3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srai_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: srai_d:
+; CHECK: srai.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srai.d(<2 x i64> %a, i32 3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srari_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: srari_d:
+; CHECK: srari.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srari.d(<2 x i64> %a, i32 3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srli_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: srli_d:
+; CHECK: srli.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srli.d(<2 x i64> %a, i32 3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: srlri_d:
+; CHECK: srlri.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srlri.d(<2 x i64> %a, i32 3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @ld_d(<2 x i64> * %ptr, i8 * %ldptr, i32 %offset) {
+entry:
+; CHECK-LABEL: ld_d
+; MSA32: addu $[[R0:[0-9]]], $5, $6
+; MSA64N32: sll $[[R2:[0-9]]], $6, 0
+; MSA64N32: sll $[[R1:[0-9]]], $5, 0
+; MSA64N32: addu $[[R0:[0-9]]], $[[R1]], $[[R2]]
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $5, $[[R1]]
+; CHECK: ld.d $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <2 x i64> @llvm.mips.ld.d(i8* %ldptr, i32 %offset)
+ store <2 x i64> %a, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @ld_d2(<2 x i64> * %ptr, i8 * %ldptr) {
+entry:
+; CHECK-LABEL: ld_d2
+; MSA32: addiu $[[R0:[0-9]]], $5, 4096
+; MSA64N32: sll $[[R1:[0-9]]], $5, 0
+; MSA64N32: addiu $[[R0:[0-9]]], $[[R1]], 4096
+; MSA64N64: daddiu $[[R0:[0-9]]], $5, 4096
+; CHECK: ld.d $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <2 x i64> @llvm.mips.ld.d(i8* %ldptr, i32 4096)
+ store <2 x i64> %a, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @st_d(<2 x i64> * %ptr, i8 * %ldptr, i32 %offset, i8 * %stptr) {
+entry:
+; CHECK-LABEL: st_d
+; MSA32: addu $[[R0:[0-9]]], $7, $6
+
+; MSA64N32-DAG: sll $[[R1:[0-9]]], $6, 0
+; MSA64N32-DAG: sll $[[R2:[0-9]+]], $7, 0
+; MSA64N32: addu $[[R0:[0-9]+]], $[[R2]], $[[R1]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $7, $[[R1]]
+; CHECK: st.d $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <2 x i64> @llvm.mips.ld.d(i8* %ldptr, i32 0)
+ call void @llvm.mips.st.d(<2 x i64> %a, i8* %stptr, i32 %offset)
+ ret void
+}
+
+
+declare <8 x i16> @llvm.mips.ldi.h(i32)
+declare <8 x i16> @llvm.mips.addvi.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bclri.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.binsli.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.binsri.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bnegi.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bseti.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clei.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clei.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clti.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clti.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.maxi.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.maxi.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.mini.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.mini.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.sldi.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.slli.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.splati.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srai.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srari.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srli.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srlri.h(<8 x i16>, i32)
+declare <4 x i32> @llvm.mips.addvi.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bclri.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.binsli.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.binsri.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bnegi.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bseti.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.ldi.w(i32)
+declare <4 x i32> @llvm.mips.clei.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clei.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clti.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clti.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.maxi.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.maxi.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.mini.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.mini.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.sldi.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.slli.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.splati.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srai.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srari.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srli.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srlri.w(<4 x i32>, i32)
+declare <2 x i64> @llvm.mips.ldi.d(i32)
+declare <2 x i64> @llvm.mips.addvi.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bclri.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.binsli.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.binsri.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bnegi.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bseti.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clei.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clei.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clti.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clti.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.maxi.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.maxi.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.mini.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.mini.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.sldi.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.slli.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.splati.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srai.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srari.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srli.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srlri.d(<2 x i64>, i32)
+declare <16 x i8> @llvm.mips.ldi.b(i32)
+declare <16 x i8> @llvm.mips.addvi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.andi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bclri.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.binsli.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.binsri.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bmnzi.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bnegi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bseli.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bseti.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clei.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clei.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clti.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clti.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.maxi.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.maxi.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.mini.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.mini.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.nori.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.ori.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.sldi.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.slli.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.splati.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srai.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srari.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srli.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srlri.b(<16 x i8>, i32)
+declare i32 @llvm.mips.copy.s.h(<8 x i16>, i32)
+declare i32 @llvm.mips.copy.u.h(<8 x i16>, i32)
+declare i32 @llvm.mips.copy.s.w(<4 x i32>, i32)
+declare i32 @llvm.mips.copy.u.w(<4 x i32>, i32)
+declare i64 @llvm.mips.copy.s.d(<2 x i64>, i32)
+declare i64 @llvm.mips.copy.u.d(<2 x i64>, i32)
+declare i32 @llvm.mips.copy.s.b(<16 x i8>, i32)
+declare i32 @llvm.mips.copy.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bmzi.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.ld.b(i8*, i32)
+declare <8 x i16> @llvm.mips.ld.h(i8*, i32)
+declare <4 x i32> @llvm.mips.ld.w(i8*, i32)
+declare <2 x i64> @llvm.mips.ld.d(i8*, i32)
+declare void @llvm.mips.st.b(<16 x i8>, i8*, i32)
+declare void @llvm.mips.st.h(<8 x i16>, i8*, i32)
+declare void @llvm.mips.st.w(<4 x i32>, i8*, i32)
+declare void @llvm.mips.st.d(<2 x i64>, i8*, i32)
diff --git a/test/CodeGen/Mips/msa/msa-nooddspreg.ll b/test/CodeGen/Mips/msa/msa-nooddspreg.ll
new file mode 100644
index 000000000000..7cfc66650e6b
--- /dev/null
+++ b/test/CodeGen/Mips/msa/msa-nooddspreg.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=mips -mcpu=mips32r5 -mattr=+fp64,+msa,+nooddspreg < %s | FileCheck %s
+
+; Test that the register allocator honours +nooddspreg and does not pick an odd
+; single precision subregister of an MSA register.
+
+@f1 = external global float
+
+@f2 = external global float
+
+@v3 = external global <4 x float>
+
+@d1 = external global double
+
+define void @test() {
+; CHECK-LABEL: test:
+entry:
+; CHECK-NOT: lwc1 $f{{[13579]+}}
+; CHECK: lwc1 $f{{[02468]+}}
+ %0 = load float, float * @f1
+ %1 = insertelement <4 x float> undef, float %0, i32 0
+ %2 = insertelement <4 x float> %1, float %0, i32 1
+ %3 = insertelement <4 x float> %2, float %0, i32 2
+ %4 = insertelement <4 x float> %3, float %0, i32 3
+
+; CHECK-NOT: lwc1 $f{{[13579]+}}
+; CHECK: lwc1 $f{{[02468]+}}
+ %5 = load float, float * @f2
+ %6 = insertelement <4 x float> undef, float %5, i32 0
+ %7 = insertelement <4 x float> %6, float %5, i32 1
+ %8 = insertelement <4 x float> %7, float %5, i32 2
+ %9 = insertelement <4 x float> %8, float %5, i32 3
+
+ %10 = fadd <4 x float> %4, %9
+ store <4 x float> %10, <4 x float> * @v3
+ ret void
+}
+
+; Test that the register allocator honours +nooddspreg and does not pick an odd
+; single precision register for a load to perform a conversion to a double.
+
+define void @test2() {
+; CHECK-LABEL: test2:
+entry:
+; CHECK-NOT: lwc1 $f{{[13579]+}}
+; CHECK: lwc1 $f{{[02468]+}}
+ %0 = load float, float * @f1
+ %1 = fpext float %0 to double
+; CHECK-NOT: lwc1 $f{{[13579]+}}
+; CHECK: lwc1 $f{{[02468]+}}
+ %2 = load float, float * @f2
+ %3 = fpext float %2 to double
+ %4 = fadd double %1, %3
+  store double %4, double * @d1
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/fast-math.ll b/test/CodeGen/NVPTX/fast-math.ll
index 9da26adc1511..d0a333d369ca 100644
--- a/test/CodeGen/NVPTX/fast-math.ll
+++ b/test/CodeGen/NVPTX/fast-math.ll
@@ -1,10 +1,8 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-
declare float @llvm.nvvm.sqrt.f(float)
-
-; CHECK: sqrt_div
+; CHECK-LABEL: sqrt_div
; CHECK: sqrt.rn.f32
; CHECK: div.rn.f32
define float @sqrt_div(float %a, float %b) {
@@ -13,7 +11,7 @@ define float @sqrt_div(float %a, float %b) {
ret float %t2
}
-; CHECK: sqrt_div_fast
+; CHECK-LABEL: sqrt_div_fast
; CHECK: sqrt.approx.f32
; CHECK: div.approx.f32
define float @sqrt_div_fast(float %a, float %b) #0 {
@@ -22,22 +20,19 @@ define float @sqrt_div_fast(float %a, float %b) #0 {
ret float %t2
}
-
-; CHECK: fadd
-; CHECK: add.f32
+; CHECK-LABEL: fadd
+; CHECK: add.rn.f32
define float @fadd(float %a, float %b) {
%t1 = fadd float %a, %b
ret float %t1
}
-; CHECK: fadd_ftz
-; CHECK: add.ftz.f32
+; CHECK-LABEL: fadd_ftz
+; CHECK: add.rn.ftz.f32
define float @fadd_ftz(float %a, float %b) #1 {
%t1 = fadd float %a, %b
ret float %t1
}
-
-
attributes #0 = { "unsafe-fp-math" = "true" }
attributes #1 = { "nvptx-f32ftz" = "true" }
diff --git a/test/CodeGen/PowerPC/change-no-infs.ll b/test/CodeGen/PowerPC/change-no-infs.ll
new file mode 100644
index 000000000000..0cd5eb5408e3
--- /dev/null
+++ b/test/CodeGen/PowerPC/change-no-infs.ll
@@ -0,0 +1,67 @@
+; Check that we can enable/disable NoInfsFPMath and NoNaNsInFPMath via function
+; attributes. An attribute on one function should not magically apply to the
+; next one.
+
+; RUN: llc < %s -mtriple=powerpc64-unknown-unknown -mcpu=pwr7 -mattr=-vsx \
+; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=SAFE
+
+; RUN: llc < %s -mtriple=powerpc64-unknown-unknown -mcpu=pwr7 -mattr=-vsx \
+; RUN: -enable-no-infs-fp-math -enable-no-nans-fp-math \
+; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=UNSAFE
+
+; The fcmp+select in these functions should be converted to a fsel instruction
+; when both NoInfsFPMath and NoNaNsInFPMath are enabled.
+
+; CHECK-LABEL: default0:
+define double @default0(double %a, double %y, double %z) {
+entry:
+; SAFE-NOT: fsel
+; UNSAFE: fsel
+ %cmp = fcmp ult double %a, 0.000000e+00
+ %z.y = select i1 %cmp, double %z, double %y
+ ret double %z.y
+}
+
+; CHECK-LABEL: unsafe_math_off:
+define double @unsafe_math_off(double %a, double %y, double %z) #0 #2 {
+entry:
+; SAFE-NOT: fsel
+; UNSAFE-NOT: fsel
+ %cmp = fcmp ult double %a, 0.000000e+00
+ %z.y = select i1 %cmp, double %z, double %y
+ ret double %z.y
+}
+
+; CHECK-LABEL: default1:
+define double @default1(double %a, double %y, double %z) {
+; SAFE-NOT: fsel
+; UNSAFE: fsel
+ %cmp = fcmp ult double %a, 0.000000e+00
+ %z.y = select i1 %cmp, double %z, double %y
+ ret double %z.y
+}
+
+; CHECK-LABEL: unsafe_math_on:
+define double @unsafe_math_on(double %a, double %y, double %z) #1 #3 {
+entry:
+; SAFE-NOT: fsel
+; UNSAFE-NOT: fsel
+ %cmp = fcmp ult double %a, 0.000000e+00
+ %z.y = select i1 %cmp, double %z, double %y
+ ret double %z.y
+}
+
+; CHECK-LABEL: default2:
+define double @default2(double %a, double %y, double %z) {
+; SAFE-NOT: fsel
+; UNSAFE: fsel
+ %cmp = fcmp ult double %a, 0.000000e+00
+ %z.y = select i1 %cmp, double %z, double %y
+ ret double %z.y
+}
+
+attributes #0 = { "no-infs-fp-math"="false" }
+attributes #1 = { "no-nans-fp-math"="false" }
+
+attributes #2 = { "no-infs-fp-math"="false" }
+attributes #3 = { "no-infs-fp-math"="true" }
diff --git a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
index b61acab7f7cb..98862cd049a5 100644
--- a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
+++ b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
@@ -23,7 +23,7 @@ entry:
; CHECK: mfvsrd [[TOGPR:[0-9]+]],
; CHECK: srd [[RSHREG:[0-9]+]], [[TOGPR]], [[SHAMREG]]
; CHECK: extsw 3, [[RSHREG]]
-; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 2
+; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29
; CHECK-P7-DAG: stxvw4x 34,
; CHECK-P7: lwax 3, [[ELEMOFFREG]],
; CHECK-BE-DAG: andi. [[ANDREG:[0-9]+]], 5, 2
@@ -52,7 +52,7 @@ entry:
; CHECK-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SHIFTREG]]
; CHECK-DAG: vperm [[PERMVEC:[0-9]+]], 2, 2, [[SHMSKREG]]
; CHECK: mfvsrd 3,
-; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 3
+; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 3, 28, 28
; CHECK-P7-DAG: stxvd2x 34,
; CHECK-P7: ldx 3, [[ELEMOFFREG]],
; CHECK-BE-DAG: andi. [[ANDREG:[0-9]+]], 5, 1
@@ -75,7 +75,7 @@ entry:
; CHECK: lvsl [[SHMSKREG:[0-9]+]], 0, [[TRUNCREG]]
; CHECK: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]]
; CHECK: xscvspdpn 1,
-; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 2
+; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29
; CHECK-P7-DAG: stxvw4x 34,
; CHECK-P7: lfsx 1, [[ELEMOFFREG]],
; CHECK-BE: sldi [[ELNOREG:[0-9]+]], 5, 2
diff --git a/test/CodeGen/WebAssembly/function-bitcasts.ll b/test/CodeGen/WebAssembly/function-bitcasts.ll
index 49980da6eb8f..e4f8f3fb6ca9 100644
--- a/test/CodeGen/WebAssembly/function-bitcasts.ll
+++ b/test/CodeGen/WebAssembly/function-bitcasts.ll
@@ -7,11 +7,18 @@ target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: test:
; CHECK-NEXT: call .Lbitcast@FUNCTION{{$}}
+; CHECK-NEXT: call .Lbitcast@FUNCTION{{$}}
; CHECK-NEXT: call .Lbitcast.1@FUNCTION{{$}}
; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0
; CHECK-NEXT: call .Lbitcast.2@FUNCTION, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0
+; CHECK-NEXT: call .Lbitcast.2@FUNCTION, $pop[[L1]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 0
+; CHECK-NEXT: call .Lbitcast.2@FUNCTION, $pop[[L2]]{{$}}
+; CHECK-NEXT: call foo0@FUNCTION
; CHECK-NEXT: i32.call $drop=, .Lbitcast.3@FUNCTION{{$}}
; CHECK-NEXT: call foo2@FUNCTION{{$}}
+; CHECK-NEXT: call foo1@FUNCTION{{$}}
; CHECK-NEXT: call foo3@FUNCTION{{$}}
; CHECK-NEXT: .endfunc
@@ -47,10 +54,19 @@ declare void @foo3()
define void @test() {
entry:
call void bitcast (void (i32)* @has_i32_arg to void ()*)()
+ call void bitcast (void (i32)* @has_i32_arg to void ()*)()
call void bitcast (i32 ()* @has_i32_ret to void ()*)()
call void bitcast (void ()* @foo0 to void (i32)*)(i32 0)
+ %p = bitcast void ()* @foo0 to void (i32)*
+ call void %p(i32 0)
+ %q = bitcast void ()* @foo0 to void (i32)*
+ call void %q(i32 0)
+ %r = bitcast void (i32)* %q to void ()*
+ call void %r()
%t = call i32 bitcast (void ()* @foo1 to i32 ()*)()
call void bitcast (void ()* @foo2 to void ()*)()
+ call void @foo1()
call void @foo3()
+
ret void
}
diff --git a/test/CodeGen/X86/atom-bypass-slow-division-64.ll b/test/CodeGen/X86/atom-bypass-slow-division-64.ll
deleted file mode 100644
index 5980b7907c9f..000000000000
--- a/test/CodeGen/X86/atom-bypass-slow-division-64.ll
+++ /dev/null
@@ -1,51 +0,0 @@
-; RUN: llc < %s -mcpu=atom -march=x86-64 | FileCheck %s
-
-target triple = "x86_64-unknown-linux-gnu"
-
-; Additional tests for 64-bit divide bypass
-
-define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
-; CHECK-LABEL: Test_get_quotient:
-; CHECK: movq %rdi, %rax
-; CHECK: orq %rsi, %rax
-; CHECK-NEXT: testq $-65536, %rax
-; CHECK-NEXT: je
-; CHECK: idivq
-; CHECK: ret
-; CHECK: divw
-; CHECK: ret
- %result = sdiv i64 %a, %b
- ret i64 %result
-}
-
-define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
-; CHECK-LABEL: Test_get_remainder:
-; CHECK: movq %rdi, %rax
-; CHECK: orq %rsi, %rax
-; CHECK-NEXT: testq $-65536, %rax
-; CHECK-NEXT: je
-; CHECK: idivq
-; CHECK: ret
-; CHECK: divw
-; CHECK: ret
- %result = srem i64 %a, %b
- ret i64 %result
-}
-
-define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
-; CHECK-LABEL: Test_get_quotient_and_remainder:
-; CHECK: movq %rdi, %rax
-; CHECK: orq %rsi, %rax
-; CHECK-NEXT: testq $-65536, %rax
-; CHECK-NEXT: je
-; CHECK: idivq
-; CHECK: divw
-; CHECK: addq
-; CHECK: ret
-; CHECK-NOT: idivq
-; CHECK-NOT: divw
- %resultdiv = sdiv i64 %a, %b
- %resultrem = srem i64 %a, %b
- %result = add i64 %resultdiv, %resultrem
- ret i64 %result
-}
diff --git a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll
deleted file mode 100644
index 79001e5de192..000000000000
--- a/test/CodeGen/X86/atom-bypass-slow-division.ll
+++ /dev/null
@@ -1,112 +0,0 @@
-; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
-
-define i32 @Test_get_quotient(i32 %a, i32 %b) nounwind {
-; CHECK-LABEL: Test_get_quotient:
-; CHECK: orl %ecx, %edx
-; CHECK-NEXT: testl $-256, %edx
-; CHECK-NEXT: je
-; CHECK: idivl
-; CHECK: ret
-; CHECK: divb
-; CHECK: ret
- %result = sdiv i32 %a, %b
- ret i32 %result
-}
-
-define i32 @Test_get_remainder(i32 %a, i32 %b) nounwind {
-; CHECK-LABEL: Test_get_remainder:
-; CHECK: orl %ecx, %edx
-; CHECK-NEXT: testl $-256, %edx
-; CHECK-NEXT: je
-; CHECK: idivl
-; CHECK: ret
-; CHECK: divb
-; CHECK: ret
- %result = srem i32 %a, %b
- ret i32 %result
-}
-
-define i32 @Test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
-; CHECK-LABEL: Test_get_quotient_and_remainder:
-; CHECK: orl %ecx, %edx
-; CHECK-NEXT: testl $-256, %edx
-; CHECK-NEXT: je
-; CHECK: idivl
-; CHECK: divb
-; CHECK: addl
-; CHECK: ret
-; CHECK-NOT: idivl
-; CHECK-NOT: divb
- %resultdiv = sdiv i32 %a, %b
- %resultrem = srem i32 %a, %b
- %result = add i32 %resultdiv, %resultrem
- ret i32 %result
-}
-
-define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
-; CHECK-LABEL: Test_use_div_and_idiv:
-; CHECK: idivl
-; CHECK: divb
-; CHECK: divl
-; CHECK: divb
-; CHECK: addl
-; CHECK: ret
- %resultidiv = sdiv i32 %a, %b
- %resultdiv = udiv i32 %a, %b
- %result = add i32 %resultidiv, %resultdiv
- ret i32 %result
-}
-
-define i32 @Test_use_div_imm_imm() nounwind {
-; CHECK-LABEL: Test_use_div_imm_imm:
-; CHECK: movl $64
- %resultdiv = sdiv i32 256, 4
- ret i32 %resultdiv
-}
-
-define i32 @Test_use_div_reg_imm(i32 %a) nounwind {
-; CHECK-LABEL: Test_use_div_reg_imm:
-; CHECK-NOT: test
-; CHECK-NOT: idiv
-; CHECK-NOT: divb
- %resultdiv = sdiv i32 %a, 33
- ret i32 %resultdiv
-}
-
-define i32 @Test_use_rem_reg_imm(i32 %a) nounwind {
-; CHECK-LABEL: Test_use_rem_reg_imm:
-; CHECK-NOT: test
-; CHECK-NOT: idiv
-; CHECK-NOT: divb
- %resultrem = srem i32 %a, 33
- ret i32 %resultrem
-}
-
-define i32 @Test_use_divrem_reg_imm(i32 %a) nounwind {
-; CHECK-LABEL: Test_use_divrem_reg_imm:
-; CHECK-NOT: test
-; CHECK-NOT: idiv
-; CHECK-NOT: divb
- %resultdiv = sdiv i32 %a, 33
- %resultrem = srem i32 %a, 33
- %result = add i32 %resultdiv, %resultrem
- ret i32 %result
-}
-
-define i32 @Test_use_div_imm_reg(i32 %a) nounwind {
-; CHECK-LABEL: Test_use_div_imm_reg:
-; CHECK: test
-; CHECK: idiv
-; CHECK: divb
- %resultdiv = sdiv i32 4, %a
- ret i32 %resultdiv
-}
-
-define i32 @Test_use_rem_imm_reg(i32 %a) nounwind {
-; CHECK-LABEL: Test_use_rem_imm_reg:
-; CHECK: test
-; CHECK: idiv
-; CHECK: divb
- %resultdiv = sdiv i32 4, %a
- ret i32 %resultdiv
-}
diff --git a/test/CodeGen/X86/atomic-eflags-reuse.ll b/test/CodeGen/X86/atomic-eflags-reuse.ll
index dc1814b55cd3..9902325fd148 100644
--- a/test/CodeGen/X86/atomic-eflags-reuse.ll
+++ b/test/CodeGen/X86/atomic-eflags-reuse.ll
@@ -176,4 +176,84 @@ entry:
ret i8 %tmp2
}
+define i8 @test_add_1_cmov_cmov(i64* %p, i8* %q) #0 {
+; TODO: It's possible to use "lock inc" here, but both cmovs need to be updated.
+; CHECK-LABEL: test_add_1_cmov_cmov:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: testq %rax, %rax
+entry:
+ %add = atomicrmw add i64* %p, i64 1 seq_cst
+ %cmp = icmp slt i64 %add, 0
+ %s1 = select i1 %cmp, i8 12, i8 34
+ store i8 %s1, i8* %q
+ %s2 = select i1 %cmp, i8 56, i8 78
+ ret i8 %s2
+}
+
+define i8 @test_sub_1_setcc_eq(i64* %p) #0 {
+; CHECK-LABEL: test_sub_1_setcc_eq:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock decq (%rdi)
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
+ %tmp1 = icmp eq i64 %tmp0, 1
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+define i8 @test_add_5_setcc_ne(i64* %p) #0 {
+; CHECK-LABEL: test_add_5_setcc_ne:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock addq $5, (%rdi)
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 5 seq_cst
+ %tmp1 = icmp ne i64 %tmp0, -5
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+define i8 @test_add_5_setcc_ne_comparand_mismatch(i64* %p) #0 {
+; CHECK-LABEL: test_add_5_setcc_ne_comparand_mismatch:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $5, %eax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 5 seq_cst
+ %tmp1 = icmp ne i64 %tmp0, 0
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+declare void @g()
+define zeroext i1 @test_sub_1_setcc_jcc(i64* %p) local_unnamed_addr #0 {
+; TODO: It's possible to use "lock dec" here, but both uses of the cmp need to
+; be updated.
+; CHECK-LABEL: test_sub_1_setcc_jcc:
+; CHECK: # BB#0: # %entry
+; CHECK: movq $-1, %rax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: cmpq $1, %rax
+; CHECK-NEXT: sete %bl
+; CHECK-NEXT: jne
+entry:
+ %add = atomicrmw volatile add i64* %p, i64 -1 seq_cst
+ %cmp = icmp ne i64 %add, 1
+ %not = xor i1 %cmp, true
+ br i1 %cmp, label %else, label %then
+then:
+ tail call void @g()
+ br label %else
+else:
+ ret i1 %not
+}
+
attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/avx-cvt.ll b/test/CodeGen/X86/avx-cvt.ll
index c8e806890d07..a7cd8cf23984 100644
--- a/test/CodeGen/X86/avx-cvt.ll
+++ b/test/CodeGen/X86/avx-cvt.ll
@@ -62,6 +62,17 @@ define <8 x float> @fptrunc00(<8 x double> %b) nounwind {
ret <8 x float> %a
}
+define <4 x float> @fptrunc01(<2 x double> %a0, <4 x float> %a1) nounwind {
+; CHECK-LABEL: fptrunc01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %ext = extractelement <2 x double> %a0, i32 0
+ %cvt = fptrunc double %ext to float
+ %res = insertelement <4 x float> %a1, float %cvt, i32 0
+ ret <4 x float> %res
+}
+
define <4 x double> @fpext00(<4 x float> %b) nounwind {
; CHECK-LABEL: fpext00:
; CHECK: # BB#0:
@@ -71,6 +82,17 @@ define <4 x double> @fpext00(<4 x float> %b) nounwind {
ret <4 x double> %a
}
+define <2 x double> @fpext01(<2 x double> %a0, <4 x float> %a1) nounwind {
+; CHECK-LABEL: fpext01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %ext = extractelement <4 x float> %a1, i32 0
+ %cvt = fpext float %ext to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
define double @funcA(i64* nocapture %e) nounwind uwtable readonly ssp {
; CHECK-LABEL: funcA:
; CHECK: # BB#0:
diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll
index 789ca2413940..c729b988cfb8 100755
--- a/test/CodeGen/X86/avx-trunc.ll
+++ b/test/CodeGen/X86/avx-trunc.ll
@@ -39,3 +39,29 @@ define <16 x i8> @trunc_16_8(<16 x i16> %A) nounwind uwtable readnone ssp{
%B = trunc <16 x i16> %A to <16 x i8>
ret <16 x i8> %B
}
+
+define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
+; CHECK-LABEL: usat_trunc_wb_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+ %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+ %x6 = trunc <16 x i16> %x5 to <16 x i8>
+ ret <16 x i8> %x6
+}
+
+define <8 x i16> @usat_trunc_dw_256(<8 x i32> %i) {
+; CHECK-LABEL: usat_trunc_dw_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %x3 = icmp ult <8 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+ %x5 = select <8 x i1> %x3, <8 x i32> %i, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+ %x6 = trunc <8 x i32> %x5 to <8 x i16>
+ ret <8 x i16> %x6
+}
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 5e50a3aef2f2..87deeb9e16c0 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -1,6 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
+
define <16 x float> @sitof32(<16 x i32> %a) nounwind {
; ALL-LABEL: sitof32:
@@ -12,255 +18,304 @@ define <16 x float> @sitof32(<16 x i32> %a) nounwind {
}
define <8 x double> @sltof864(<8 x i64> %a) {
-; KNL-LABEL: sltof864:
-; KNL: ## BB#0:
-; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; KNL-NEXT: vpextrq $1, %xmm1, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
-; KNL-NEXT: vmovq %xmm1, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; KNL-NEXT: vpextrq $1, %xmm2, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
-; KNL-NEXT: vmovq %xmm2, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; KNL-NEXT: vpextrq $1, %xmm2, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
-; KNL-NEXT: vmovq %xmm2, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL-NEXT: retq
+; NODQ-LABEL: sltof864:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; NODQ-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT: retq
;
-; SKX-LABEL: sltof864:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0
-; SKX-NEXT: retq
+; DQ-LABEL: sltof864:
+; DQ: ## BB#0:
+; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
+; DQ-NEXT: retq
%b = sitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
}
define <4 x double> @sltof464(<4 x i64> %a) {
-; KNL-LABEL: sltof464:
-; KNL: ## BB#0:
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpextrq $1, %xmm1, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
-; KNL-NEXT: vmovq %xmm1, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: retq
+; NODQ-LABEL: sltof464:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT: retq
+;
+; VLDQ-LABEL: sltof464:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sltof464:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtqq2pd %ymm0, %ymm0
-; SKX-NEXT: retq
+; AVX512DQ-LABEL: sltof464:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x double>
ret <4 x double> %b
}
define <2 x float> @sltof2f32(<2 x i64> %a) {
-; KNL-LABEL: sltof2f32:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
-; KNL-NEXT: retq
+; NODQ-LABEL: sltof2f32:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
+; NODQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; NODQ-NEXT: retq
;
-; SKX-LABEL: sltof2f32:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtqq2ps %xmm0, %xmm0
-; SKX-NEXT: retq
+; VLDQ-LABEL: sltof2f32:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0
+; VLDQ-NEXT: retq
+;
+; AVX512DQ-LABEL: sltof2f32:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: retq
%b = sitofp <2 x i64> %a to <2 x float>
ret <2 x float>%b
}
define <4 x float> @sltof4f32_mem(<4 x i64>* %a) {
-; KNL-LABEL: sltof4f32_mem:
-; KNL: ## BB#0:
-; KNL-NEXT: vmovdqu (%rdi), %ymm0
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT: retq
+; NODQ-LABEL: sltof4f32_mem:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vmovdqu (%rdi), %ymm0
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT: retq
+;
+; VLDQ-LABEL: sltof4f32_mem:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sltof4f32_mem:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtqq2psy (%rdi), %xmm0
-; SKX-NEXT: retq
+; AVX512DQ-LABEL: sltof4f32_mem:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vmovups (%rdi), %ymm0
+; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: retq
%a1 = load <4 x i64>, <4 x i64>* %a, align 8
%b = sitofp <4 x i64> %a1 to <4 x float>
ret <4 x float>%b
}
define <4 x i64> @f64tosl(<4 x double> %a) {
-; KNL-LABEL: f64tosl:
-; KNL: ## BB#0:
-; KNL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; KNL-NEXT: vcvttsd2si %xmm1, %rax
-; KNL-NEXT: vmovq %rax, %xmm2
-; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; KNL-NEXT: vcvttsd2si %xmm1, %rax
-; KNL-NEXT: vmovq %rax, %xmm1
-; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; KNL-NEXT: vcvttsd2si %xmm0, %rax
-; KNL-NEXT: vmovq %rax, %xmm2
-; KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; KNL-NEXT: vcvttsd2si %xmm0, %rax
-; KNL-NEXT: vmovq %rax, %xmm0
-; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: retq
+; NODQ-LABEL: f64tosl:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1
+; NODQ-NEXT: vcvttsd2si %xmm1, %rax
+; NODQ-NEXT: vmovq %rax, %xmm2
+; NODQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; NODQ-NEXT: vcvttsd2si %xmm1, %rax
+; NODQ-NEXT: vmovq %rax, %xmm1
+; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; NODQ-NEXT: vcvttsd2si %xmm0, %rax
+; NODQ-NEXT: vmovq %rax, %xmm2
+; NODQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; NODQ-NEXT: vcvttsd2si %xmm0, %rax
+; NODQ-NEXT: vmovq %rax, %xmm0
+; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT: retq
;
-; SKX-LABEL: f64tosl:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvttpd2qq %ymm0, %ymm0
-; SKX-NEXT: retq
+; VLDQ-LABEL: f64tosl:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0
+; VLDQ-NEXT: retq
+;
+; AVX512DQ-LABEL: f64tosl:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i64>
ret <4 x i64> %b
}
define <4 x i64> @f32tosl(<4 x float> %a) {
-; KNL-LABEL: f32tosl:
-; KNL: ## BB#0:
-; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; KNL-NEXT: vcvttss2si %xmm1, %rax
-; KNL-NEXT: vmovq %rax, %xmm1
-; KNL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; KNL-NEXT: vcvttss2si %xmm2, %rax
-; KNL-NEXT: vmovq %rax, %xmm2
-; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; KNL-NEXT: vcvttss2si %xmm0, %rax
-; KNL-NEXT: vmovq %rax, %xmm2
-; KNL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; KNL-NEXT: vcvttss2si %xmm0, %rax
-; KNL-NEXT: vmovq %rax, %xmm0
-; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: retq
+; NODQ-LABEL: f32tosl:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; NODQ-NEXT: vcvttss2si %xmm1, %rax
+; NODQ-NEXT: vmovq %rax, %xmm1
+; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; NODQ-NEXT: vcvttss2si %xmm2, %rax
+; NODQ-NEXT: vmovq %rax, %xmm2
+; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; NODQ-NEXT: vcvttss2si %xmm0, %rax
+; NODQ-NEXT: vmovq %rax, %xmm2
+; NODQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; NODQ-NEXT: vcvttss2si %xmm0, %rax
+; NODQ-NEXT: vmovq %rax, %xmm0
+; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT: retq
+;
+; VLDQ-LABEL: f32tosl:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: f32tosl:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvttps2qq %xmm0, %ymm0
-; SKX-NEXT: retq
+; AVX512DQ-LABEL: f32tosl:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: retq
%b = fptosi <4 x float> %a to <4 x i64>
ret <4 x i64> %b
}
define <4 x float> @sltof432(<4 x i64> %a) {
-; KNL-LABEL: sltof432:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT: retq
+; NODQ-LABEL: sltof432:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT: retq
+;
+; VLDQ-LABEL: sltof432:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sltof432:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtqq2ps %ymm0, %xmm0
-; SKX-NEXT: retq
+; AVX512DQ-LABEL: sltof432:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
define <4 x float> @ultof432(<4 x i64> %a) {
-; KNL-LABEL: ultof432:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT: retq
+; NODQ-LABEL: ultof432:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT: retq
;
-; SKX-LABEL: ultof432:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtuqq2ps %ymm0, %xmm0
-; SKX-NEXT: retq
+; VLDQ-LABEL: ultof432:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
+; VLDQ-NEXT: retq
+;
+; AVX512DQ-LABEL: ultof432:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: retq
%b = uitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
define <8 x double> @ultof64(<8 x i64> %a) {
-; KNL-LABEL: ultof64:
-; KNL: ## BB#0:
-; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; KNL-NEXT: vpextrq $1, %xmm1, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
-; KNL-NEXT: vmovq %xmm1, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; KNL-NEXT: vpextrq $1, %xmm2, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
-; KNL-NEXT: vmovq %xmm2, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; KNL-NEXT: vpextrq $1, %xmm2, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
-; KNL-NEXT: vmovq %xmm2, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm0
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL-NEXT: retq
+; NODQ-LABEL: ultof64:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; NODQ-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm0
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT: retq
;
-; SKX-LABEL: ultof64:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; SKX-NEXT: retq
+; DQ-LABEL: ultof64:
+; DQ: ## BB#0:
+; DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
+; DQ-NEXT: retq
%b = uitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
}
@@ -284,33 +339,33 @@ define <16 x i32> @fptoui00(<16 x float> %a) nounwind {
}
define <8 x i32> @fptoui_256(<8 x float> %a) nounwind {
-; KNL-LABEL: fptoui_256:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: fptoui_256:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: fptoui_256:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvttps2udq %ymm0, %ymm0
-; SKX-NEXT: retq
+; VL-LABEL: fptoui_256:
+; VL: ## BB#0:
+; VL-NEXT: vcvttps2udq %ymm0, %ymm0
+; VL-NEXT: retq
%b = fptoui <8 x float> %a to <8 x i32>
ret <8 x i32> %b
}
define <4 x i32> @fptoui_128(<4 x float> %a) nounwind {
-; KNL-LABEL: fptoui_128:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: fptoui_128:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
+; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: fptoui_128:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvttps2udq %xmm0, %xmm0
-; SKX-NEXT: retq
+; VL-LABEL: fptoui_128:
+; VL: ## BB#0:
+; VL-NEXT: vcvttps2udq %xmm0, %xmm0
+; VL-NEXT: retq
%b = fptoui <4 x float> %a to <4 x i32>
ret <4 x i32> %b
}
@@ -325,17 +380,17 @@ define <8 x i32> @fptoui01(<8 x double> %a) nounwind {
}
define <4 x i32> @fptoui_256d(<4 x double> %a) nounwind {
-; KNL-LABEL: fptoui_256d:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvttpd2udq %zmm0, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: fptoui_256d:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
+; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: fptoui_256d:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvttpd2udq %ymm0, %xmm0
-; SKX-NEXT: retq
+; VL-LABEL: fptoui_256d:
+; VL: ## BB#0:
+; VL-NEXT: vcvttpd2udq %ymm0, %xmm0
+; VL-NEXT: retq
%b = fptoui <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
@@ -349,34 +404,34 @@ define <8 x double> @sitof64(<8 x i32> %a) {
ret <8 x double> %b
}
define <8 x double> @sitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
-; KNL-LABEL: sitof64_mask:
-; KNL: ## BB#0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
-; KNL-NEXT: retq
+; NODQ-LABEL: sitof64_mask:
+; NODQ: ## BB#0:
+; NODQ-NEXT: kmovw %edi, %k1
+; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
+; NODQ-NEXT: retq
;
-; SKX-LABEL: sitof64_mask:
-; SKX: ## BB#0:
-; SKX-NEXT: kmovb %edi, %k1
-; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
-; SKX-NEXT: retq
+; DQ-LABEL: sitof64_mask:
+; DQ: ## BB#0:
+; DQ-NEXT: kmovb %edi, %k1
+; DQ-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
+; DQ-NEXT: retq
%1 = bitcast i8 %c to <8 x i1>
%2 = sitofp <8 x i32> %b to <8 x double>
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
ret <8 x double> %3
}
define <8 x double> @sitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
-; KNL-LABEL: sitof64_maskz:
-; KNL: ## BB#0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; KNL-NEXT: retq
+; NODQ-LABEL: sitof64_maskz:
+; NODQ: ## BB#0:
+; NODQ-NEXT: kmovw %edi, %k1
+; NODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; NODQ-NEXT: retq
;
-; SKX-LABEL: sitof64_maskz:
-; SKX: ## BB#0:
-; SKX-NEXT: kmovb %edi, %k1
-; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; SKX-NEXT: retq
+; DQ-LABEL: sitof64_maskz:
+; DQ: ## BB#0:
+; DQ-NEXT: kmovb %edi, %k1
+; DQ-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; DQ-NEXT: retq
%1 = bitcast i8 %b to <8 x i1>
%2 = sitofp <8 x i32> %a to <8 x double>
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
@@ -402,19 +457,19 @@ define <4 x i32> @fptosi03(<4 x double> %a) {
}
define <16 x float> @fptrunc00(<16 x double> %b) nounwind {
-; KNL-LABEL: fptrunc00:
-; KNL: ## BB#0:
-; KNL-NEXT: vcvtpd2ps %zmm0, %ymm0
-; KNL-NEXT: vcvtpd2ps %zmm1, %ymm1
-; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL-NEXT: retq
+; NODQ-LABEL: fptrunc00:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vcvtpd2ps %zmm0, %ymm0
+; NODQ-NEXT: vcvtpd2ps %zmm1, %ymm1
+; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT: retq
;
-; SKX-LABEL: fptrunc00:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtpd2ps %zmm0, %ymm0
-; SKX-NEXT: vcvtpd2ps %zmm1, %ymm1
-; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
-; SKX-NEXT: retq
+; DQ-LABEL: fptrunc00:
+; DQ: ## BB#0:
+; DQ-NEXT: vcvtpd2ps %zmm0, %ymm0
+; DQ-NEXT: vcvtpd2ps %zmm1, %ymm1
+; DQ-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
+; DQ-NEXT: retq
%a = fptrunc <16 x double> %b to <16 x float>
ret <16 x float> %a
}
@@ -429,25 +484,36 @@ define <4 x float> @fptrunc01(<4 x double> %b) {
}
define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) {
-; KNL-LABEL: fptrunc02:
-; KNL: ## BB#0:
-; KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
-; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; KNL-NEXT: vpand %xmm0, %xmm1, %xmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: fptrunc02:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vpslld $31, %xmm1, %xmm1
+; NOVL-NEXT: vpsrad $31, %xmm1, %xmm1
+; NOVL-NEXT: vcvtpd2ps %ymm0, %xmm0
+; NOVL-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: retq
;
-; SKX-LABEL: fptrunc02:
-; SKX: ## BB#0:
-; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
-; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
-; SKX-NEXT: retq
+; VL-LABEL: fptrunc02:
+; VL: ## BB#0:
+; VL-NEXT: vpslld $31, %xmm1, %xmm1
+; VL-NEXT: vptestmd %xmm1, %xmm1, %k1
+; VL-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
+; VL-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
%c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
ret <4 x float> %c
}
+define <4 x float> @fptrunc03(<2 x double> %a0, <4 x float> %a1) nounwind {
+; ALL-LABEL: fptrunc03:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0
+; ALL-NEXT: retq
+ %ext = extractelement <2 x double> %a0, i32 0
+ %cvt = fptrunc double %ext to float
+ %res = insertelement <4 x float> %a1, float %cvt, i32 0
+ ret <4 x float> %res
+}
+
define <8 x double> @fpext00(<8 x float> %b) nounwind {
; ALL-LABEL: fpext00:
; ALL: ## BB#0:
@@ -458,24 +524,35 @@ define <8 x double> @fpext00(<8 x float> %b) nounwind {
}
define <4 x double> @fpext01(<4 x float> %b, <4 x double>%b1, <4 x double>%a1) {
-; KNL-LABEL: fpext01:
-; KNL: ## BB#0:
-; KNL-NEXT: vcvtps2pd %xmm0, %ymm0
-; KNL-NEXT: vcmpltpd %ymm2, %ymm1, %ymm1
-; KNL-NEXT: vandpd %ymm0, %ymm1, %ymm0
-; KNL-NEXT: retq
+; NOVL-LABEL: fpext01:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vcvtps2pd %xmm0, %ymm0
+; NOVL-NEXT: vcmpltpd %ymm2, %ymm1, %ymm1
+; NOVL-NEXT: vandpd %ymm0, %ymm1, %ymm0
+; NOVL-NEXT: retq
;
-; SKX-LABEL: fpext01:
-; SKX: ## BB#0:
-; SKX-NEXT: vcmpltpd %ymm2, %ymm1, %k1
-; SKX-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
-; SKX-NEXT: retq
+; VL-LABEL: fpext01:
+; VL: ## BB#0:
+; VL-NEXT: vcmpltpd %ymm2, %ymm1, %k1
+; VL-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
+; VL-NEXT: retq
%a = fpext <4 x float> %b to <4 x double>
%mask = fcmp ogt <4 x double>%a1, %b1
%c = select <4 x i1>%mask, <4 x double>%a, <4 x double>zeroinitializer
ret <4 x double> %c
}
+define <2 x double> @fpext02(<2 x double> %a0, <4 x float> %a1) nounwind {
+; ALL-LABEL: fpext02:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
+ %ext = extractelement <4 x float> %a1, i32 0
+ %cvt = fpext float %ext to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
define double @funcA(i64* nocapture %e) {
; ALL-LABEL: funcA:
; ALL: ## BB#0: ## %entry
@@ -589,53 +666,53 @@ define i32 @float_to_int(float %x) {
}
define <16 x double> @uitof64(<16 x i32> %a) nounwind {
-; KNL-LABEL: uitof64:
-; KNL: ## BB#0:
-; KNL-NEXT: vcvtudq2pd %ymm0, %zmm2
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; KNL-NEXT: vcvtudq2pd %ymm0, %zmm1
-; KNL-NEXT: vmovaps %zmm2, %zmm0
-; KNL-NEXT: retq
+; NODQ-LABEL: uitof64:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm2
+; NODQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm1
+; NODQ-NEXT: vmovaps %zmm2, %zmm0
+; NODQ-NEXT: retq
;
-; SKX-LABEL: uitof64:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtudq2pd %ymm0, %zmm2
-; SKX-NEXT: vextracti32x8 $1, %zmm0, %ymm0
-; SKX-NEXT: vcvtudq2pd %ymm0, %zmm1
-; SKX-NEXT: vmovaps %zmm2, %zmm0
-; SKX-NEXT: retq
+; DQ-LABEL: uitof64:
+; DQ: ## BB#0:
+; DQ-NEXT: vcvtudq2pd %ymm0, %zmm2
+; DQ-NEXT: vextracti32x8 $1, %zmm0, %ymm0
+; DQ-NEXT: vcvtudq2pd %ymm0, %zmm1
+; DQ-NEXT: vmovaps %zmm2, %zmm0
+; DQ-NEXT: retq
%b = uitofp <16 x i32> %a to <16 x double>
ret <16 x double> %b
}
define <8 x double> @uitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
-; KNL-LABEL: uitof64_mask:
-; KNL: ## BB#0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
-; KNL-NEXT: retq
+; NODQ-LABEL: uitof64_mask:
+; NODQ: ## BB#0:
+; NODQ-NEXT: kmovw %edi, %k1
+; NODQ-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
+; NODQ-NEXT: retq
;
-; SKX-LABEL: uitof64_mask:
-; SKX: ## BB#0:
-; SKX-NEXT: kmovb %edi, %k1
-; SKX-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
-; SKX-NEXT: retq
+; DQ-LABEL: uitof64_mask:
+; DQ: ## BB#0:
+; DQ-NEXT: kmovb %edi, %k1
+; DQ-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
+; DQ-NEXT: retq
%1 = bitcast i8 %c to <8 x i1>
%2 = uitofp <8 x i32> %b to <8 x double>
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
ret <8 x double> %3
}
define <8 x double> @uitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
-; KNL-LABEL: uitof64_maskz:
-; KNL: ## BB#0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; KNL-NEXT: retq
+; NODQ-LABEL: uitof64_maskz:
+; NODQ: ## BB#0:
+; NODQ-NEXT: kmovw %edi, %k1
+; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; NODQ-NEXT: retq
;
-; SKX-LABEL: uitof64_maskz:
-; SKX: ## BB#0:
-; SKX-NEXT: kmovb %edi, %k1
-; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; SKX-NEXT: retq
+; DQ-LABEL: uitof64_maskz:
+; DQ: ## BB#0:
+; DQ-NEXT: kmovb %edi, %k1
+; DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; DQ-NEXT: retq
%1 = bitcast i8 %b to <8 x i1>
%2 = uitofp <8 x i32> %a to <8 x double>
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
@@ -643,17 +720,17 @@ define <8 x double> @uitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
}
define <4 x double> @uitof64_256(<4 x i32> %a) nounwind {
-; KNL-LABEL: uitof64_256:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: uitof64_256:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitof64_256:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtudq2pd %xmm0, %ymm0
-; SKX-NEXT: retq
+; VL-LABEL: uitof64_256:
+; VL: ## BB#0:
+; VL-NEXT: vcvtudq2pd %xmm0, %ymm0
+; VL-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x double>
ret <4 x double> %b
}
@@ -668,33 +745,33 @@ define <16 x float> @uitof32(<16 x i32> %a) nounwind {
}
define <8 x float> @uitof32_256(<8 x i32> %a) nounwind {
-; KNL-LABEL: uitof32_256:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: uitof32_256:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitof32_256:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0
-; SKX-NEXT: retq
+; VL-LABEL: uitof32_256:
+; VL: ## BB#0:
+; VL-NEXT: vcvtudq2ps %ymm0, %ymm0
+; VL-NEXT: retq
%b = uitofp <8 x i32> %a to <8 x float>
ret <8 x float> %b
}
define <4 x float> @uitof32_128(<4 x i32> %a) nounwind {
-; KNL-LABEL: uitof32_128:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: uitof32_128:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitof32_128:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0
-; SKX-NEXT: retq
+; VL-LABEL: uitof32_128:
+; VL: ## BB#0:
+; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
+; VL-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x float>
ret <4 x float> %b
}
@@ -736,21 +813,21 @@ define double @uitofp03(i32 %a) nounwind {
}
define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
-; KNL-LABEL: sitofp_16i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vcvtdq2ps %zmm0, %zmm0
-; KNL-NEXT: retq
+; NODQ-LABEL: sitofp_16i1_float:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0
+; NODQ-NEXT: retq
;
-; SKX-LABEL: sitofp_16i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
-; SKX-NEXT: vpmovm2d %k0, %zmm0
-; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0
-; SKX-NEXT: retq
+; DQ-LABEL: sitofp_16i1_float:
+; DQ: ## BB#0:
+; DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
+; DQ-NEXT: vpmovm2d %k0, %zmm0
+; DQ-NEXT: vcvtdq2ps %zmm0, %zmm0
+; DQ-NEXT: retq
%mask = icmp slt <16 x i32> %a, zeroinitializer
%1 = sitofp <16 x i1> %mask to <16 x float>
ret <16 x float> %1
@@ -799,157 +876,259 @@ define <8 x double> @sitofp_8i8_double(<8 x i8> %a) {
}
define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
-; KNL-LABEL: sitofp_16i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; KNL-NEXT: vcmpltpd %zmm1, %zmm2, %k1
-; KNL-NEXT: vcmpltpd %zmm0, %zmm2, %k2
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0
-; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm1, %ymm1
-; KNL-NEXT: vcvtdq2pd %ymm1, %zmm1
-; KNL-NEXT: retq
+; NOVLDQ-LABEL: sitofp_16i1_double:
+; NOVLDQ: ## BB#0:
+; NOVLDQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; NOVLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1
+; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2
+; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0
+; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; NOVLDQ-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NOVLDQ-NEXT: vpmovqd %zmm1, %ymm1
+; NOVLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
+; NOVLDQ-NEXT: retq
+;
+; VLDQ-LABEL: sitofp_16i1_double:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorpd %zmm2, %zmm2, %zmm2
+; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0
+; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1
+; VLDQ-NEXT: vpmovm2d %k1, %ymm0
+; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; VLDQ-NEXT: vpmovm2d %k0, %ymm1
+; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: sitofp_16i1_double:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; VLNODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1
+; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2
+; VLNODQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm0 {%k2} {z}
+; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm1 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2pd %ymm1, %zmm1
+; VLNODQ-NEXT: retq
;
-; SKX-LABEL: sitofp_16i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorpd %zmm2, %zmm2, %zmm2
-; SKX-NEXT: vcmpltpd %zmm1, %zmm2, %k0
-; SKX-NEXT: vcmpltpd %zmm0, %zmm2, %k1
-; SKX-NEXT: vpmovm2d %k1, %ymm0
-; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0
-; SKX-NEXT: vpmovm2d %k0, %ymm1
-; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1
-; SKX-NEXT: retq
+; AVX512DQ-LABEL: sitofp_16i1_double:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vxorpd %zmm2, %zmm2, %zmm2
+; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0
+; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1
+; AVX512DQ-NEXT: vpmovm2q %k1, %zmm0
+; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT: vpmovm2q %k0, %zmm1
+; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512DQ-NEXT: vcvtdq2pd %ymm1, %zmm1
+; AVX512DQ-NEXT: retq
%cmpres = fcmp ogt <16 x double> %a, zeroinitializer
%1 = sitofp <16 x i1> %cmpres to <16 x double>
ret <16 x double> %1
}
define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
-; KNL-LABEL: sitofp_8i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0
-; KNL-NEXT: retq
+; NOVLDQ-LABEL: sitofp_8i1_double:
+; NOVLDQ: ## BB#0:
+; NOVLDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0
+; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; NOVLDQ-NEXT: retq
;
-; SKX-LABEL: sitofp_8i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorpd %zmm1, %zmm1, %zmm1
-; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k0
-; SKX-NEXT: vpmovm2d %k0, %ymm0
-; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0
-; SKX-NEXT: retq
+; VLDQ-LABEL: sitofp_8i1_double:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorpd %zmm1, %zmm1, %zmm1
+; VLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %ymm0
+; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: sitofp_8i1_double:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; VLNODQ-NEXT: retq
+;
+; AVX512DQ-LABEL: sitofp_8i1_double:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vxorpd %zmm1, %zmm1, %zmm1
+; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
+; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
+; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT: retq
%cmpres = fcmp ogt <8 x double> %a, zeroinitializer
%1 = sitofp <8 x i1> %cmpres to <8 x double>
ret <8 x double> %1
}
define <8 x float> @sitofp_8i1_float(<8 x float> %a) {
-; KNL-LABEL: sitofp_8i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vcmpltps %zmm0, %zmm1, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtdq2ps %ymm0, %ymm0
-; KNL-NEXT: retq
+; NOVLDQ-LABEL: sitofp_8i1_float:
+; NOVLDQ: ## BB#0:
+; NOVLDQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVLDQ-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1
+; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0
+; NOVLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0
+; NOVLDQ-NEXT: retq
+;
+; VLDQ-LABEL: sitofp_8i1_float:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; VLDQ-NEXT: vcmpltps %ymm0, %ymm1, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %ymm0
+; VLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sitofp_8i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; SKX-NEXT: vcmpltps %ymm0, %ymm1, %k0
-; SKX-NEXT: vpmovm2d %k0, %ymm0
-; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0
-; SKX-NEXT: retq
+; VLNODQ-LABEL: sitofp_8i1_float:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; VLNODQ-NEXT: vcmpltps %ymm0, %ymm1, %k1
+; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2ps %ymm0, %ymm0
+; VLNODQ-NEXT: retq
+;
+; AVX512DQ-LABEL: sitofp_8i1_float:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0
+; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
+; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
%cmpres = fcmp ogt <8 x float> %a, zeroinitializer
%1 = sitofp <8 x i1> %cmpres to <8 x float>
ret <8 x float> %1
}
define <4 x float> @sitofp_4i1_float(<4 x float> %a) {
-; KNL-LABEL: sitofp_4i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vcvtdq2ps %xmm0, %xmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: sitofp_4i1_float:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0
+; NOVL-NEXT: retq
+;
+; VLDQ-LABEL: sitofp_4i1_float:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %xmm0
+; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sitofp_4i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0
-; SKX-NEXT: vpmovm2d %k0, %xmm0
-; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; SKX-NEXT: retq
+; VLNODQ-LABEL: sitofp_4i1_float:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1
+; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0
+; VLNODQ-NEXT: retq
%cmpres = fcmp ogt <4 x float> %a, zeroinitializer
%1 = sitofp <4 x i1> %cmpres to <4 x float>
ret <4 x float> %1
}
define <4 x double> @sitofp_4i1_double(<4 x double> %a) {
-; KNL-LABEL: sitofp_4i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtdq2pd %xmm0, %ymm0
-; KNL-NEXT: retq
+; NOVL-LABEL: sitofp_4i1_double:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; NOVL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; NOVL-NEXT: vpmovqd %zmm0, %ymm0
+; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0
+; NOVL-NEXT: retq
+;
+; VLDQ-LABEL: sitofp_4i1_double:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; VLDQ-NEXT: vcmpltpd %ymm0, %ymm1, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %xmm0
+; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sitofp_4i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; SKX-NEXT: vcmpltpd %ymm0, %ymm1, %k0
-; SKX-NEXT: vpmovm2d %k0, %xmm0
-; SKX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; SKX-NEXT: retq
+; VLNODQ-LABEL: sitofp_4i1_double:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; VLNODQ-NEXT: vcmpltpd %ymm0, %ymm1, %k1
+; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0
+; VLNODQ-NEXT: retq
%cmpres = fcmp ogt <4 x double> %a, zeroinitializer
%1 = sitofp <4 x i1> %cmpres to <4 x double>
ret <4 x double> %1
}
define <2 x float> @sitofp_2i1_float(<2 x float> %a) {
-; KNL-LABEL: sitofp_2i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[1]
-; KNL-NEXT: vcvtdq2ps %xmm0, %xmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: sitofp_2i1_float:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[1]
+; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0
+; NOVL-NEXT: retq
;
-; SKX-LABEL: sitofp_2i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0
-; SKX-NEXT: vpmovm2d %k0, %xmm0
-; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; SKX-NEXT: retq
+; VLDQ-LABEL: sitofp_2i1_float:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %xmm0
+; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: sitofp_2i1_float:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1
+; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0
+; VLNODQ-NEXT: retq
%cmpres = fcmp ogt <2 x float> %a, zeroinitializer
%1 = sitofp <2 x i1> %cmpres to <2 x float>
ret <2 x float> %1
}
define <2 x double> @sitofp_2i1_double(<2 x double> %a) {
-; KNL-LABEL: sitofp_2i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL-NEXT: vcvtdq2pd %xmm0, %xmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: sitofp_2i1_double:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0
+; NOVL-NEXT: retq
+;
+; VLDQ-LABEL: sitofp_2i1_double:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0
+; VLDQ-NEXT: vpmovm2q %k0, %xmm0
+; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sitofp_2i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0
-; SKX-NEXT: vpmovm2q %k0, %xmm0
-; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0
-; SKX-NEXT: retq
+; VLNODQ-LABEL: sitofp_2i1_double:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1
+; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
+; VLNODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VLNODQ-NEXT: retq
%cmpres = fcmp ogt <2 x double> %a, zeroinitializer
%1 = sitofp <2 x i1> %cmpres to <2 x double>
ret <2 x double> %1
@@ -989,174 +1168,187 @@ define <16 x float> @uitofp_16i1_float(<16 x i32> %a) {
}
define <16 x double> @uitofp_16i1_double(<16 x i32> %a) {
-; KNL-LABEL: uitofp_16i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT: movq {{.*}}(%rip), %rax
-; KNL-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0
-; KNL-NEXT: kshiftrw $8, %k1, %k1
-; KNL-NEXT: vpbroadcastq %rax, %zmm1 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm1, %ymm1
-; KNL-NEXT: vcvtudq2pd %ymm1, %zmm1
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_16i1_double:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; NOVL-NEXT: movq {{.*}}(%rip), %rax
+; NOVL-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z}
+; NOVL-NEXT: vpmovqd %zmm0, %ymm0
+; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; NOVL-NEXT: kshiftrw $8, %k1, %k1
+; NOVL-NEXT: vpbroadcastq %rax, %zmm1 {%k1} {z}
+; NOVL-NEXT: vpmovqd %zmm1, %ymm1
+; NOVL-NEXT: vcvtudq2pd %ymm1, %zmm1
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitofp_16i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; SKX-NEXT: movl {{.*}}(%rip), %eax
-; SKX-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z}
-; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0
-; SKX-NEXT: kshiftrw $8, %k1, %k1
-; SKX-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z}
-; SKX-NEXT: vcvtudq2pd %ymm1, %zmm1
-; SKX-NEXT: retq
+; VL-LABEL: uitofp_16i1_double:
+; VL: ## BB#0:
+; VL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; VL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; VL-NEXT: movl {{.*}}(%rip), %eax
+; VL-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z}
+; VL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; VL-NEXT: kshiftrw $8, %k1, %k1
+; VL-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z}
+; VL-NEXT: vcvtudq2pd %ymm1, %zmm1
+; VL-NEXT: retq
%mask = icmp slt <16 x i32> %a, zeroinitializer
%1 = uitofp <16 x i1> %mask to <16 x double>
ret <16 x double> %1
}
define <8 x float> @uitofp_8i1_float(<8 x i32> %a) {
-; KNL-LABEL: uitofp_8i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_8i1_float:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; NOVL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; NOVL-NEXT: vpmovqd %zmm0, %ymm0
+; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitofp_8i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
-; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0
-; SKX-NEXT: retq
+; VL-LABEL: uitofp_8i1_float:
+; VL: ## BB#0:
+; VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
+; VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; VL-NEXT: vcvtudq2ps %ymm0, %ymm0
+; VL-NEXT: retq
%mask = icmp slt <8 x i32> %a, zeroinitializer
%1 = uitofp <8 x i1> %mask to <8 x float>
ret <8 x float> %1
}
define <8 x double> @uitofp_8i1_double(<8 x i32> %a) {
-; KNL-LABEL: uitofp_8i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_8i1_double:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; NOVL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; NOVL-NEXT: vpmovqd %zmm0, %ymm0
+; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitofp_8i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
-; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0
-; SKX-NEXT: retq
+; VL-LABEL: uitofp_8i1_double:
+; VL: ## BB#0:
+; VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
+; VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; VL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; VL-NEXT: retq
%mask = icmp slt <8 x i32> %a, zeroinitializer
%1 = uitofp <8 x i1> %mask to <8 x double>
ret <8 x double> %1
}
define <4 x float> @uitofp_4i1_float(<4 x i32> %a) {
-; KNL-LABEL: uitofp_4i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_4i1_float:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitofp_4i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
-; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0
-; SKX-NEXT: retq
+; VL-LABEL: uitofp_4i1_float:
+; VL: ## BB#0:
+; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
+; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
+; VL-NEXT: retq
%mask = icmp slt <4 x i32> %a, zeroinitializer
%1 = uitofp <4 x i1> %mask to <4 x float>
ret <4 x float> %1
}
define <4 x double> @uitofp_4i1_double(<4 x i32> %a) {
-; KNL-LABEL: uitofp_4i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpsrld $31, %xmm0, %xmm0
-; KNL-NEXT: vcvtdq2pd %xmm0, %ymm0
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_4i1_double:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vpsrld $31, %xmm0, %xmm0
+; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitofp_4i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
-; SKX-NEXT: vcvtudq2pd %xmm0, %ymm0
-; SKX-NEXT: retq
+; VL-LABEL: uitofp_4i1_double:
+; VL: ## BB#0:
+; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
+; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; VL-NEXT: vcvtudq2pd %xmm0, %ymm0
+; VL-NEXT: retq
%mask = icmp slt <4 x i32> %a, zeroinitializer
%1 = uitofp <4 x i1> %mask to <4 x double>
ret <4 x double> %1
}
define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
-; KNL-LABEL: uitofp_2i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_2i1_float:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; NOVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NOVL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vpextrq $1, %xmm0, %rax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
+; NOVL-NEXT: vmovq %xmm0, %rax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitofp_2i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
-; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0
-; SKX-NEXT: retq
+; VL-LABEL: uitofp_2i1_float:
+; VL: ## BB#0:
+; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; VL-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
+; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
+; VL-NEXT: retq
%mask = icmp ult <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x float>
ret <2 x float> %1
}
define <2 x double> @uitofp_2i1_double(<2 x i32> %a) {
-; KNL-LABEL: uitofp_2i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_2i1_double:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; NOVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NOVL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; NOVL-NEXT: retq
+;
+; VLDQ-LABEL: uitofp_2i1_double:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; VLDQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
+; VLDQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; VLDQ-NEXT: vcvtuqq2pd %xmm0, %xmm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: uitofp_2i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
-; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
-; SKX-NEXT: vcvtuqq2pd %xmm0, %xmm0
-; SKX-NEXT: retq
+; VLNODQ-LABEL: uitofp_2i1_double:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; VLNODQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
+; VLNODQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm1
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
+; VLNODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VLNODQ-NEXT: retq
%mask = icmp ult <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x double>
ret <2 x double> %1
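The avx512-cvt.ll hunks above are mostly mechanical: the per-CPU KNL/SKX check prefixes are regrouped by feature (NODQ/DQ, NOVL/VL, NOVLDQ/VLDQ/VLNODQ, plus AVX512DQ), so one block of CHECK lines can be shared by every subtarget combination that has or lacks AVX512DQ and AVX512VL. The RUN lines that define these prefixes fall outside the hunks shown here; a plausible layout, offered only as a sketch and not taken from the patch, would be:

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f \
; RUN:   | FileCheck %s --check-prefixes=ALL,NODQ,NOVL,NOVLDQ
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512dq \
; RUN:   | FileCheck %s --check-prefixes=ALL,DQ,NOVL,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512dq \
; RUN:   | FileCheck %s --check-prefixes=ALL,DQ,VL,VLDQ
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl \
; RUN:   | FileCheck %s --check-prefixes=ALL,NODQ,VL,VLNODQ

Each prefix then names a feature set rather than a CPU, which is why the old KNL bodies become NODQ/NOVL/NOVLDQ and the old SKX bodies become DQ/VL/VLDQ above.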
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 42579377ef39..3f427298c177 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -179,3 +179,22 @@ define float @pr30561_f32(float %b, float %a, i1 %c) {
%cond = select i1 %c, float %a, float %b
ret float %cond
}
+
+define <16 x i16> @pr31515(<16 x i1> %a, <16 x i1> %b, <16 x i16> %c) nounwind {
+; CHECK-LABEL: pr31515:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1
+; CHECK-NEXT: vpslld $31, %zmm1, %zmm1
+; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0
+; CHECK-NEXT: vpslld $31, %zmm0, %zmm0
+; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k1
+; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
+; CHECK-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpmovdw %zmm0, %ymm0
+; CHECK-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %mask = and <16 x i1> %a, %b
+ %res = select <16 x i1> %mask, <16 x i16> zeroinitializer, <16 x i16> %c
+ ret <16 x i16> %res
+}
+
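The new pr31515 test pins down a lowering where a vector select whose true operand is all zeros collapses into a single vpandn of the sign-extended condition with the false operand. It rests on the identity select m, 0, c == c & ~sext(m); a standalone sketch of that equivalence, hand-written for illustration and not part of the patch, looks like:

define <16 x i16> @pr31515_equiv(<16 x i1> %a, <16 x i1> %b, <16 x i16> %c) nounwind {
  %mask = and <16 x i1> %a, %b
  ; all-ones lanes where the mask is set, all-zeros elsewhere
  %ext = sext <16 x i1> %mask to <16 x i16>
  ; invert and AND with %c: selected lanes become 0, the rest keep %c
  %not = xor <16 x i16> %ext, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  %res = and <16 x i16> %not, %c
  ret <16 x i16> %res
}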
diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll
index 04d21ecd3e82..fb6c55b26e7c 100644
--- a/test/CodeGen/X86/avx512-trunc.ll
+++ b/test/CodeGen/X86/avx512-trunc.ll
@@ -505,9 +505,8 @@ define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 {
define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
; KNL-LABEL: usat_trunc_wb_256_mem:
; KNL: ## BB#0:
-; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovdqu %xmm0, (%rdi)
; KNL-NEXT: retq
;
@@ -525,9 +524,8 @@ define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
; KNL-LABEL: usat_trunc_wb_256:
; KNL: ## BB#0:
-; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_wb_256:
@@ -607,3 +605,103 @@ define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
ret void
}
+define <32 x i8> @usat_trunc_db_1024(<32 x i32> %i) {
+; KNL-LABEL: usat_trunc_db_1024:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovusdb %zmm0, %xmm0
+; KNL-NEXT: vpmovusdb %zmm1, %xmm1
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: usat_trunc_db_1024:
+; SKX: ## BB#0:
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
+; SKX-NEXT: vpminud %zmm2, %zmm1, %zmm1
+; SKX-NEXT: vpminud %zmm2, %zmm0, %zmm0
+; SKX-NEXT: vpmovdw %zmm0, %ymm0
+; SKX-NEXT: vpmovdw %zmm1, %ymm1
+; SKX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT: vpmovwb %zmm0, %ymm0
+; SKX-NEXT: retq
+ %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+ %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+ %x6 = trunc <32 x i32> %x5 to <32 x i8>
+ ret <32 x i8> %x6
+}
+
+define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
+; KNL-LABEL: usat_trunc_db_1024_mem:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovusdb %zmm0, %xmm0
+; KNL-NEXT: vpmovusdb %zmm1, %xmm1
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: vmovdqu %ymm0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: usat_trunc_db_1024_mem:
+; SKX: ## BB#0:
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
+; SKX-NEXT: vpminud %zmm2, %zmm1, %zmm1
+; SKX-NEXT: vpminud %zmm2, %zmm0, %zmm0
+; SKX-NEXT: vpmovdw %zmm0, %ymm0
+; SKX-NEXT: vpmovdw %zmm1, %ymm1
+; SKX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT: vpmovwb %zmm0, (%rdi)
+; SKX-NEXT: retq
+ %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+ %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+ %x6 = trunc <32 x i32> %x5 to <32 x i8>
+ store <32 x i8>%x6, <32 x i8>* %p, align 1
+ ret void
+}
+
+define <16 x i16> @usat_trunc_dw_512(<16 x i32> %i) {
+; ALL-LABEL: usat_trunc_dw_512:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovusdw %zmm0, %ymm0
+; ALL-NEXT: retq
+ %x3 = icmp ult <16 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+ %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+ %x6 = trunc <16 x i32> %x5 to <16 x i16>
+ ret <16 x i16> %x6
+}
+
+define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) {
+; ALL-LABEL: usat_trunc_wb_128:
+; ALL: ## BB#0:
+; ALL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0
+; ALL-NEXT: retq
+ %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+ %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+ %x6 = trunc <8 x i16> %x5 to <8 x i8>
+ ret <8 x i8>%x6
+}
+
+define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) {
+; KNL-LABEL: usat_trunc_qw_1024:
+; KNL: ## BB#0:
+; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
+; KNL-NEXT: vpminuq %zmm2, %zmm1, %zmm1
+; KNL-NEXT: vpminuq %zmm2, %zmm0, %zmm0
+; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: vpmovqd %zmm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: usat_trunc_qw_1024:
+; SKX: ## BB#0:
+; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
+; SKX-NEXT: vpminuq %zmm2, %zmm1, %zmm1
+; SKX-NEXT: vpminuq %zmm2, %zmm0, %zmm0
+; SKX-NEXT: vpmovqd %zmm0, %ymm0
+; SKX-NEXT: vpmovqd %zmm1, %ymm1
+; SKX-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT: vpmovdw %zmm0, %ymm0
+; SKX-NEXT: retq
+ %x3 = icmp ult <16 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+ %x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+ %x6 = trunc <16 x i64> %x5 to <16 x i16>
+ ret <16 x i16> %x6
+}
+
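Two separate improvements are covered in avx512-trunc.ll. First, the KNL path for usat_trunc_wb_256* now leans on the unsigned saturation built into vpackuswb instead of an explicit vpminuw clamp. Second, the new tests check that the clamp-then-truncate idiom (icmp ult against the maximum value, select, trunc) is matched as an unsigned saturating truncate at several widths. Written out at a single 512-bit register width, the idiom has the shape sketched below; the function name is invented here and the exact instruction chosen may vary by subtarget:

define <16 x i8> @usat_trunc_db_512_sketch(<16 x i32> %i) {
  ; clamp each lane to 255, then truncate -- the shape the backend matches
  %lt  = icmp ult <16 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %sat = select <16 x i1> %lt, <16 x i32> %i, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %tr  = trunc <16 x i32> %sat to <16 x i8>
  ret <16 x i8> %tr
}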
diff --git a/test/CodeGen/X86/bypass-slow-division-32.ll b/test/CodeGen/X86/bypass-slow-division-32.ll
new file mode 100644
index 000000000000..ea545d22385c
--- /dev/null
+++ b/test/CodeGen/X86/bypass-slow-division-32.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Check that 32-bit division is bypassed correctly.
+; RUN: llc < %s -mattr=+idivl-to-divb -mtriple=i686-linux | FileCheck %s
+
+define i32 @Test_get_quotient(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: Test_get_quotient:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je .LBB0_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %cl
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: retl
+ %result = sdiv i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @Test_get_remainder(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: Test_get_remainder:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je .LBB1_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB1_1:
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %cl
+; CHECK-NEXT: movzbl %ah, %eax # NOREX
+; CHECK-NEXT: retl
+ %result = srem i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @Test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: Test_get_quotient_and_remainder:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je .LBB2_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB2_1:
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %cl
+; CHECK-NEXT: movzbl %ah, %edx # NOREX
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retl
+ %resultdiv = sdiv i32 %a, %b
+ %resultrem = srem i32 %a, %b
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: Test_use_div_and_idiv:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl %ecx, %edi
+; CHECK-NEXT: orl %ebx, %edi
+; CHECK-NEXT: testl $-256, %edi
+; CHECK-NEXT: je .LBB3_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ebx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: testl $-256, %edi
+; CHECK-NEXT: jne .LBB3_5
+; CHECK-NEXT: jmp .LBB3_4
+; CHECK-NEXT: .LBB3_1:
+; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %bl
+; CHECK-NEXT: movzbl %al, %esi
+; CHECK-NEXT: testl $-256, %edi
+; CHECK-NEXT: je .LBB3_4
+; CHECK-NEXT: .LBB3_5:
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: jmp .LBB3_6
+; CHECK-NEXT: .LBB3_4:
+; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %bl
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: .LBB3_6:
+; CHECK-NEXT: addl %eax, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
+ %resultidiv = sdiv i32 %a, %b
+ %resultdiv = udiv i32 %a, %b
+ %result = add i32 %resultidiv, %resultdiv
+ ret i32 %result
+}
+
+define i32 @Test_use_div_imm_imm() nounwind {
+; CHECK-LABEL: Test_use_div_imm_imm:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $64, %eax
+; CHECK-NEXT: retl
+ %resultdiv = sdiv i32 256, 4
+ ret i32 %resultdiv
+}
+
+define i32 @Test_use_div_reg_imm(i32 %a) nounwind {
+; CHECK-LABEL: Test_use_div_reg_imm:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $1041204193, %eax # imm = 0x3E0F83E1
+; CHECK-NEXT: imull {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: sarl $3, %edx
+; CHECK-NEXT: leal (%edx,%eax), %eax
+; CHECK-NEXT: retl
+ %resultdiv = sdiv i32 %a, 33
+ ret i32 %resultdiv
+}
+
+define i32 @Test_use_rem_reg_imm(i32 %a) nounwind {
+; CHECK-LABEL: Test_use_rem_reg_imm:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl $1041204193, %edx # imm = 0x3E0F83E1
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: imull %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: sarl $3, %edx
+; CHECK-NEXT: addl %eax, %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shll $5, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
+ %resultrem = srem i32 %a, 33
+ ret i32 %resultrem
+}
+
+define i32 @Test_use_divrem_reg_imm(i32 %a) nounwind {
+; CHECK-LABEL: Test_use_divrem_reg_imm:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl $1041204193, %edx # imm = 0x3E0F83E1
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: imull %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: sarl $3, %edx
+; CHECK-NEXT: addl %eax, %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shll $5, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: addl %edx, %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
+ %resultdiv = sdiv i32 %a, 33
+ %resultrem = srem i32 %a, 33
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @Test_use_div_imm_reg(i32 %a) nounwind {
+; CHECK-LABEL: Test_use_div_imm_reg:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: testl $-256, %ecx
+; CHECK-NEXT: je .LBB8_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: movl $4, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB8_1:
+; CHECK-NEXT: movb $4, %al
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %cl
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: retl
+ %resultdiv = sdiv i32 4, %a
+ ret i32 %resultdiv
+}
+
+define i32 @Test_use_rem_imm_reg(i32 %a) nounwind {
+; CHECK-LABEL: Test_use_rem_imm_reg:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: testl $-256, %ecx
+; CHECK-NEXT: je .LBB9_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: movl $4, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB9_1:
+; CHECK-NEXT: movb $4, %al
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %cl
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: retl
+ %resultdiv = sdiv i32 4, %a
+ ret i32 %resultdiv
+}
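For orientation: the pass exercised here wraps a slow 32-bit division in a runtime check of the OR of the two operands, and when every set bit fits in the low byte it uses the much cheaper 8-bit divb, which is exactly the orl / testl $-256 / divb shape in the CHECK lines above. In IR terms the transformation amounts to roughly the following sketch; the names are invented, and the real pass works internally rather than producing this literal code:

define i32 @bypassed_sdiv_sketch(i32 %a, i32 %b) {
entry:
  %or    = or i32 %a, %b
  ; both operands are in [0, 255] iff no bit above bit 7 is set in the OR
  %small = icmp ult i32 %or, 256
  br i1 %small, label %fast, label %slow
fast:
  %a8 = trunc i32 %a to i8
  %b8 = trunc i32 %b to i8
  %q8 = udiv i8 %a8, %b8            ; cheap divb
  %qf = zext i8 %q8 to i32
  br label %done
slow:
  %qs = sdiv i32 %a, %b             ; full-width idivl
  br label %done
done:
  %q = phi i32 [ %qf, %fast ], [ %qs, %slow ]
  ret i32 %q
}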
diff --git a/test/CodeGen/X86/bypass-slow-division-64.ll b/test/CodeGen/X86/bypass-slow-division-64.ll
new file mode 100644
index 000000000000..b067f9e1503c
--- /dev/null
+++ b/test/CodeGen/X86/bypass-slow-division-64.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Check that 64-bit division is bypassed correctly.
+; RUN: llc < %s -mattr=+idivq-to-divl -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+; Additional tests for 64-bit divide bypass
+
+define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: Test_get_quotient:
+; CHECK: # BB#0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: orq %rsi, %rax
+; CHECK-NEXT: shrq $32, %rax
+; CHECK-NEXT: je .LBB0_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: cqto
+; CHECK-NEXT: idivq %rsi
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: divl %esi
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def>
+; CHECK-NEXT: retq
+ %result = sdiv i64 %a, %b
+ ret i64 %result
+}
+
+define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: Test_get_remainder:
+; CHECK: # BB#0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: orq %rsi, %rax
+; CHECK-NEXT: shrq $32, %rax
+; CHECK-NEXT: je .LBB1_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: cqto
+; CHECK-NEXT: idivq %rsi
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB1_1:
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: divl %esi
+; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
+ %result = srem i64 %a, %b
+ ret i64 %result
+}
+
+define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: Test_get_quotient_and_remainder:
+; CHECK: # BB#0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: orq %rsi, %rax
+; CHECK-NEXT: shrq $32, %rax
+; CHECK-NEXT: je .LBB2_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: cqto
+; CHECK-NEXT: idivq %rsi
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB2_1:
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: divl %esi
+; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def>
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: retq
+ %resultdiv = sdiv i64 %a, %b
+ %resultrem = srem i64 %a, %b
+ %result = add i64 %resultdiv, %resultrem
+ ret i64 %result
+}
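The 64-bit flavour is the same trick one level up: the upper 32 bits of (a|b) are tested (the shrq $32 in the checks), and when they are zero a 32-bit divl stands in for the expensive idivq. A minimal sketch of that shape, again with invented names:

define i64 @bypassed_sdiv64_sketch(i64 %a, i64 %b) {
entry:
  %or    = or i64 %a, %b
  %hi    = lshr i64 %or, 32
  %small = icmp eq i64 %hi, 0
  br i1 %small, label %fast, label %slow
fast:
  %a32 = trunc i64 %a to i32
  %b32 = trunc i64 %b to i32
  %q32 = udiv i32 %a32, %b32        ; 32-bit divl
  %qf  = zext i32 %q32 to i64
  br label %done
slow:
  %qs = sdiv i64 %a, %b             ; full idivq
  br label %done
done:
  %q = phi i64 [ %qf, %fast ], [ %qs, %slow ]
  ret i64 %q
}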
diff --git a/test/CodeGen/X86/bypass-slow-division-tune.ll b/test/CodeGen/X86/bypass-slow-division-tune.ll
new file mode 100644
index 000000000000..b6a53130cf23
--- /dev/null
+++ b/test/CodeGen/X86/bypass-slow-division-tune.ll
@@ -0,0 +1,55 @@
+; Check that a division is bypassed only when appropriate.
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=atom < %s | FileCheck -check-prefixes=ATOM,CHECK %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=silvermont < %s | FileCheck -check-prefixes=REST,CHECK %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck -check-prefixes=REST,CHECK %s
+
+; Verify that div32 is bypassed only for Atoms.
+define i32 @div32(i32 %a, i32 %b) {
+entry:
+; ATOM-LABEL: div32:
+; ATOM: orl %{{.*}}, [[REG:%[a-z]+]]
+; ATOM: testl $-256, [[REG]]
+; ATOM: divb
+;
+; REST-LABEL: div32:
+; REST-NOT: divb
+;
+ %div = sdiv i32 %a, %b
+ ret i32 %div
+}
+
+; Verify that div64 is always bypassed.
+define i64 @div64(i64 %a, i64 %b) {
+entry:
+; CHECK-LABEL: div64:
+; CHECK: orq %{{.*}}, [[REG:%[a-z]+]]
+; CHECK: shrq $32, [[REG]]
+; CHECK: divl
+;
+ %div = sdiv i64 %a, %b
+ ret i64 %div
+}
+
+
+; Verify that no extra code is generated when optimizing for size.
+
+define i64 @div64_optsize(i64 %a, i64 %b) optsize {
+; CHECK-LABEL: div64_optsize:
+; CHECK-NOT: divl
+ %div = sdiv i64 %a, %b
+ ret i64 %div
+}
+
+define i32 @div32_optsize(i32 %a, i32 %b) optsize {
+; CHECK-LABEL: div32_optsize:
+; CHECK-NOT: divb
+ %div = sdiv i32 %a, %b
+ ret i32 %div
+}
+
+define i32 @div32_minsize(i32 %a, i32 %b) minsize {
+; CHECK-LABEL: div32_minsize:
+; CHECK-NOT: divb
+ %div = sdiv i32 %a, %b
+ ret i32 %div
+}
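The bypass is modelled as a subtarget feature (idivl-to-divb and idivq-to-divl, switched on implicitly by -mcpu in this file and explicitly by -mattr in the other new files), so it should also be reachable per function through the "target-features" attribute. The sketch below assumes that usage; it is not taken from the patch:

define i64 @per_function_bypass(i64 %a, i64 %b) #0 {
  %q = sdiv i64 %a, %b
  ret i64 %q
}
attributes #0 = { "target-features"="+idivq-to-divl" }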
diff --git a/test/CodeGen/X86/change-unsafe-fp-math.ll b/test/CodeGen/X86/change-unsafe-fp-math.ll
new file mode 100644
index 000000000000..33a7ec9bfc79
--- /dev/null
+++ b/test/CodeGen/X86/change-unsafe-fp-math.ll
@@ -0,0 +1,56 @@
+; Check that we can enable/disable UnsafeFPMath via function attributes. An
+; attribute on one function should not magically apply to the next one.
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown \
+; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=SAFE
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -enable-unsafe-fp-math \
+; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=UNSAFE
+
+; The div in these functions should be converted to a mul when unsafe-fp-math
+; is enabled.
+
+; CHECK-LABEL: unsafe_fp_math_default0:
+define double @unsafe_fp_math_default0(double %x) {
+; SAFE: divsd
+; UNSAFE: mulsd
+ %div = fdiv double %x, 2.0
+ ret double %div
+}
+
+; CHECK-LABEL: unsafe_fp_math_off:
+define double @unsafe_fp_math_off(double %x) #0 {
+; SAFE: divsd
+; UNSAFE: divsd
+ %div = fdiv double %x, 2.0
+ ret double %div
+}
+
+; CHECK-LABEL: unsafe_fp_math_default1:
+define double @unsafe_fp_math_default1(double %x) {
+; With unsafe math enabled, can change this div to a mul.
+; SAFE: divsd
+; UNSAFE: mulsd
+ %div = fdiv double %x, 2.0
+ ret double %div
+}
+
+; CHECK-LABEL: unsafe_fp_math_on:
+define double @unsafe_fp_math_on(double %x) #1 {
+; SAFE: mulsd
+; UNSAFE: mulsd
+ %div = fdiv double %x, 2.0
+ ret double %div
+}
+
+; CHECK-LABEL: unsafe_fp_math_default2:
+define double @unsafe_fp_math_default2(double %x) {
+; With unsafe math enabled, we can change this div to a mul.
+; SAFE: divsd
+; UNSAFE: mulsd
+ %div = fdiv double %x, 2.0
+ ret double %div
+}
+
+attributes #0 = { "unsafe-fp-math"="false" }
+attributes #1 = { "unsafe-fp-math"="true" }
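
A standalone sketch of the attribute-driven case, mirroring unsafe_fp_math_on above; the function name is illustrative, and the mulsd expectation is the same one the test checks when the "unsafe-fp-math"="true" attribute is present.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; The per-function attribute alone is enough: no -enable-unsafe-fp-math flag,
; yet the divide by 2.0 should become a multiply.
; CHECK-LABEL: half_of:
; CHECK: mulsd
define double @half_of(double %x) #0 {
  %div = fdiv double %x, 2.0
  ret double %div
}

attributes #0 = { "unsafe-fp-math"="true" }
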
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index d24f27ddf22c..5d05c699f431 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -281,4 +281,54 @@ define void @test20(i32 %bf.load, i8 %x1, i8* %b_addr) {
; CHECK: setne
; CHECK: testl
; CHECK: setne
-} \ No newline at end of file
+}
+
+define i32 @test21(i64 %val) {
+ %and = and i64 %val, -2199023255552 ; 0xFFFFFE0000000000
+ %cmp = icmp ne i64 %and, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+
+; CHECK-LABEL: test21
+; CHECK: shrq $41, %rdi
+; CHECK-NOT: test
+; CHECK: setne %al
+; CHECK: retq
+}
+
+; The AND-to-SHR transformation is enabled only for eq/ne condition codes.
+define i32 @test22(i64 %val) {
+ %and = and i64 %val, -2199023255552 ; 0xFFFFFE0000000000
+ %cmp = icmp ult i64 %and, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+
+; CHECK-LABEL: test22
+; CHECK-NOT: shrq $41
+; CHECK: retq
+}
+
+define i32 @test23(i64 %val) {
+ %and = and i64 %val, -1048576 ; 0xFFFFFFFFFFF00000
+ %cmp = icmp ne i64 %and, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+
+; CHECK-LABEL: test23
+; CHECK: testq $-1048576, %rdi
+; CHECK: setne %al
+; CHECK: retq
+}
+
+define i32 @test24(i64 %val) {
+ %and = and i64 %val, 281474976710655 ; 0x0000FFFFFFFFFFFF
+ %cmp = icmp ne i64 %and, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+
+; CHECK-LABEL: test24
+; CHECK: shlq $16, %rdi
+; CHECK-NOT: test
+; CHECK: setne %al
+; CHECK: retq
+}
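
The equivalence behind test21 and test24: when the AND mask covers a contiguous run of bits reaching bit 63 (or starting at bit 0, for the shift-left case), comparing the masked value with zero is the same as comparing a shifted copy of the value with zero, so the AND can be dropped. A hand-written IR equivalent of test21's expected form (illustrative, not part of the patch):

define i32 @test21_shifted(i64 %val) {
  ; 0xFFFFFE0000000000 covers bits 41..63, so (%val & mask) != 0 holds
  ; exactly when (%val >> 41) != 0, which is what the shrq $41 check expects.
  %shifted = lshr i64 %val, 41
  %cmp = icmp ne i64 %shifted, 0
  %ret = zext i1 %cmp to i32
  ret i32 %ret
}
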
diff --git a/test/CodeGen/X86/cpus.ll b/test/CodeGen/X86/cpus.ll
index ee1f7bb5295b..20ce932a184b 100644
--- a/test/CodeGen/X86/cpus.ll
+++ b/test/CodeGen/X86/cpus.ll
@@ -33,3 +33,4 @@
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bdver4 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=btver1 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=btver2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=znver1 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll
index 13448a13ab4c..8c12e7148aa7 100644
--- a/test/CodeGen/X86/extractelement-index.ll
+++ b/test/CodeGen/X86/extractelement-index.ll
@@ -404,6 +404,7 @@ define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind {
define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v16i8_var:
; SSE: # BB#0:
+; SSE-NEXT: andl $15, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movb (%rdi,%rax), %al
@@ -411,6 +412,7 @@ define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
;
; AVX-LABEL: extractelement_v16i8_var:
; AVX: # BB#0:
+; AVX-NEXT: andl $15, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movb (%rdi,%rax), %al
@@ -426,6 +428,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: andl $31, %edi
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: movq %rsp, %rax
@@ -440,6 +443,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: andl $31, %edi
; AVX-NEXT: vmovaps %ymm0, (%rsp)
; AVX-NEXT: movq %rsp, %rax
; AVX-NEXT: movb (%rdi,%rax), %al
@@ -454,12 +458,14 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
define i16 @extractelement_v8i16_var(<8 x i16> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v8i16_var:
; SSE: # BB#0:
+; SSE-NEXT: andl $7, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movzwl -24(%rsp,%rdi,2), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v8i16_var:
; AVX: # BB#0:
+; AVX-NEXT: andl $7, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %eax
; AVX-NEXT: retq
@@ -474,6 +480,7 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: andl $15, %edi
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: movzwl (%rsp,%rdi,2), %eax
@@ -487,6 +494,7 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: andl $15, %edi
; AVX-NEXT: vmovaps %ymm0, (%rsp)
; AVX-NEXT: movzwl (%rsp,%rdi,2), %eax
; AVX-NEXT: movq %rbp, %rsp
@@ -500,12 +508,14 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
define i32 @extractelement_v4i32_var(<4 x i32> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v4i32_var:
; SSE: # BB#0:
+; SSE-NEXT: andl $3, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movl -24(%rsp,%rdi,4), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v4i32_var:
; AVX: # BB#0:
+; AVX-NEXT: andl $3, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movl -24(%rsp,%rdi,4), %eax
; AVX-NEXT: retq
@@ -520,6 +530,7 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: andl $7, %edi
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: movl (%rsp,%rdi,4), %eax
@@ -533,6 +544,7 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: andl $7, %edi
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: movl (%rsp,%rdi,4), %eax
; AVX1-NEXT: movq %rbp, %rsp
@@ -554,12 +566,14 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
define i64 @extractelement_v2i64_var(<2 x i64> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v2i64_var:
; SSE: # BB#0:
+; SSE-NEXT: andl $1, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -24(%rsp,%rdi,8), %rax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v2i64_var:
; AVX: # BB#0:
+; AVX-NEXT: andl $1, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movq -24(%rsp,%rdi,8), %rax
; AVX-NEXT: retq
@@ -574,6 +588,7 @@ define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind {
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: andl $3, %edi
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: movq (%rsp,%rdi,8), %rax
@@ -587,6 +602,7 @@ define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind {
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: andl $3, %edi
; AVX-NEXT: vmovaps %ymm0, (%rsp)
; AVX-NEXT: movq (%rsp,%rdi,8), %rax
; AVX-NEXT: movq %rbp, %rsp
diff --git a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
index 946516c8a46d..c418e67ecb67 100644
--- a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
+++ b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
@@ -16,11 +16,11 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
; CHECK-NEXT: movl 20(%esp), %edx
; CHECK-NEXT: paddd (%edx), %xmm0
; CHECK-NEXT: movdqa %xmm0, (%edx)
-; CHECK-NEXT: shll $4, %ecx
-; CHECK-NEXT: movl (%ecx,%edx), %esi
-; CHECK-NEXT: movl 12(%ecx,%edx), %edi
-; CHECK-NEXT: movl 8(%ecx,%edx), %ebx
-; CHECK-NEXT: movl 4(%ecx,%edx), %edx
+; CHECK-NEXT: movl (%edx), %esi
+; CHECK-NEXT: movl 12(%edx), %edi
+; CHECK-NEXT: movl 8(%edx), %ebx
+; CHECK-NEXT: movl 4(%edx), %edx
+; CHECK-NEXT: shll $4, %ecx
; CHECK-NEXT: movl %esi, 12(%eax,%ecx)
; CHECK-NEXT: movl %edx, (%eax,%ecx)
; CHECK-NEXT: movl %ebx, 8(%eax,%ecx)
diff --git a/test/CodeGen/X86/i64-mem-copy.ll b/test/CodeGen/X86/i64-mem-copy.ll
index 1fa752774251..7b1926da245c 100644
--- a/test/CodeGen/X86/i64-mem-copy.ll
+++ b/test/CodeGen/X86/i64-mem-copy.ll
@@ -68,9 +68,10 @@ define void @store_i64_from_vector256(<16 x i16> %x, <16 x i16> %y, i64* %i) {
define void @PR23476(<5 x i64> %in, i64* %out, i32 %index) {
; X32-LABEL: PR23476:
+; X32: andl $7, %eax
; X32: movsd {{.*#+}} xmm0 = mem[0],zero
; X32: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: movsd %xmm0, (%eax)
+; X32-NEXT: movsd %xmm0, (%ecx)
%ext = extractelement <5 x i64> %in, i32 %index
store i64 %ext, i64* %out, align 8
ret void
diff --git a/test/CodeGen/X86/implicit-null-checks.mir b/test/CodeGen/X86/implicit-null-checks.mir
index 39f5242a8477..81351511374c 100644
--- a/test/CodeGen/X86/implicit-null-checks.mir
+++ b/test/CodeGen/X86/implicit-null-checks.mir
@@ -319,7 +319,7 @@ liveins:
- { reg: '%rsi' }
# CHECK: bb.0.entry:
# CHECK: %rbx = MOV64rr %rdx
-# CHECK-NEXT: %rdi = FAULTING_LOAD_OP %bb.3.is_null, 260, killed %rbx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+# CHECK-NEXT: %rdi = FAULTING_LOAD_OP %bb.3.is_null, {{[0-9]+}}, killed %rbx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
body: |
bb.0.entry:
diff --git a/test/CodeGen/X86/lzcnt-zext-cmp.ll b/test/CodeGen/X86/lzcnt-zext-cmp.ll
index 6f4cb84a2b9c..c69dbf573f46 100644
--- a/test/CodeGen/X86/lzcnt-zext-cmp.ll
+++ b/test/CodeGen/X86/lzcnt-zext-cmp.ll
@@ -3,6 +3,8 @@
; Eg: zext(or(setcc(cmp), setcc(cmp))) -> shr(or(lzcnt, lzcnt))
; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 -mattr=-fast-lzcnt | FileCheck --check-prefix=NOFASTLZCNT %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=znver1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=znver1 -mattr=-fast-lzcnt | FileCheck --check-prefix=NOFASTLZCNT %s
; Test one 32-bit input, output is 32-bit, no transformations expected.
define i32 @test_zext_cmp0(i32 %a) {
diff --git a/test/CodeGen/X86/peephole.mir b/test/CodeGen/X86/peephole.mir
new file mode 100644
index 000000000000..6391836e9ca2
--- /dev/null
+++ b/test/CodeGen/X86/peephole.mir
@@ -0,0 +1,40 @@
+# RUN: llc -mtriple=x86_64-- -run-pass=peephole-opt %s -o - | FileCheck %s
+--- |
+ define void @func() { ret void }
+...
+---
+# Check that instructions with MI.isBitcast() are replaced by COPY only if
+# there are no SUBREG_TO_REG users.
+# CHECK-LABEL: name: func
+name: func
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: fr32 }
+ - { id: 2, class: gr32 }
+
+ - { id: 3, class: gr32 }
+ - { id: 4, class: fr32 }
+ - { id: 5, class: gr32 }
+ - { id: 6, class: gr64 }
+
+body: |
+ bb.0:
+ ; CHECK: %1 = VMOVDI2SSrr %0
+ ; CHECK: %7 = COPY %0
+ ; CHECK: NOOP implicit %7
+ %0 = MOV32ri 42
+ %1 = VMOVDI2SSrr %0
+ %2 = MOVSS2DIrr %1
+ NOOP implicit %2
+
+ ; CHECK: %4 = VMOVDI2SSrr %3
+ ; CHECK-NOT: COPY
+ ; CHECK: %5 = MOVSS2DIrr %4
+ ; CHECK: %6 = SUBREG_TO_REG %5, 0
+ ; CHECK: NOOP implicit %6
+ %3 = MOV32ri 42
+ %4 = VMOVDI2SSrr %3
+ %5 = MOVSS2DIrr %4
+ %6 = SUBREG_TO_REG %5, 0, %subreg.sub_32bit
+ NOOP implicit %6
+...
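
At the IR level, the first half of the MIR above is just a bitcast round trip; a sketch of that source pattern for orientation (illustrative only, since the MIR test drives peephole-opt directly and never goes through IR):

define i32 @bitcast_roundtrip(i32 %x) {
  ; i32 -> float -> i32: both bitcasts preserve the value, which is why the
  ; second one (MOVSS2DIrr above) can be rewritten as a COPY of the original
  ; GPR, unless a SUBREG_TO_REG user requires the bitcast to stay.
  %f = bitcast i32 %x to float
  %i = bitcast float %f to i32
  ret i32 %i
}
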
diff --git a/test/CodeGen/X86/slow-div.ll b/test/CodeGen/X86/slow-div.ll
deleted file mode 100644
index 82928521ac2b..000000000000
--- a/test/CodeGen/X86/slow-div.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivl-to-divb < %s | FileCheck -check-prefix=DIV32 %s
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivq-to-divw < %s | FileCheck -check-prefix=DIV64 %s
-
-define i32 @div32(i32 %a, i32 %b) {
-entry:
-; DIV32-LABEL: div32:
-; DIV32: orl %{{.*}}, [[REG:%[a-z]+]]
-; DIV32: testl $-256, [[REG]]
-; DIV32: divb
-; DIV64-LABEL: div32:
-; DIV64-NOT: divb
- %div = sdiv i32 %a, %b
- ret i32 %div
-}
-
-define i64 @div64(i64 %a, i64 %b) {
-entry:
-; DIV32-LABEL: div64:
-; DIV32-NOT: divw
-; DIV64-LABEL: div64:
-; DIV64: orq %{{.*}}, [[REG:%[a-z]+]]
-; DIV64: testq $-65536, [[REG]]
-; DIV64: divw
- %div = sdiv i64 %a, %b
- ret i64 %div
-}
-
-; Verify that no extra code is generated when optimizing for size.
-
-define i32 @div32_optsize(i32 %a, i32 %b) optsize {
-; DIV32-LABEL: div32_optsize:
-; DIV32-NOT: divb
- %div = sdiv i32 %a, %b
- ret i32 %div
-}
-
-define i32 @div32_minsize(i32 %a, i32 %b) minsize {
-; DIV32-LABEL: div32_minsize:
-; DIV32-NOT: divb
- %div = sdiv i32 %a, %b
- ret i32 %div
-}
-
diff --git a/test/CodeGen/X86/slow-unaligned-mem.ll b/test/CodeGen/X86/slow-unaligned-mem.ll
index 41e9a95bcdd8..8251eb324a77 100644
--- a/test/CodeGen/X86/slow-unaligned-mem.ll
+++ b/test/CodeGen/X86/slow-unaligned-mem.ll
@@ -46,6 +46,7 @@
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver2 2>&1 | FileCheck %s --check-prefix=FAST
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver3 2>&1 | FileCheck %s --check-prefix=FAST
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver4 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver1 2>&1 | FileCheck %s --check-prefix=FAST
; Other chips with slow unaligned memory accesses
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 4af9758f122d..972a33f13cd0 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -1257,15 +1257,12 @@ define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind {
define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm_cvtsi32_sd:
; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: cvtsi2sdl %eax, %xmm1
-; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: cvtsi2sdl {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsi32_sd:
; X64: # BB#0:
-; X64-NEXT: cvtsi2sdl %edi, %xmm1
-; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: cvtsi2sdl %edi, %xmm0
; X64-NEXT: retq
%cvt = sitofp i32 %a1 to double
%res = insertelement <2 x double> %a0, double %cvt, i32 0
@@ -1293,14 +1290,12 @@ define <2 x i64> @test_mm_cvtsi32_si128(i32 %a0) nounwind {
define <2 x double> @test_mm_cvtss_sd(<2 x double> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cvtss_sd:
; X32: # BB#0:
-; X32-NEXT: cvtss2sd %xmm1, %xmm1
-; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: cvtss2sd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtss_sd:
; X64: # BB#0:
-; X64-NEXT: cvtss2sd %xmm1, %xmm1
-; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: cvtss2sd %xmm1, %xmm0
; X64-NEXT: retq
%ext = extractelement <4 x float> %a1, i32 0
%cvt = fpext float %ext to double
diff --git a/test/CodeGen/X86/vec_ins_extract-1.ll b/test/CodeGen/X86/vec_ins_extract-1.ll
index 85c7875d923b..1dc8b7abd207 100644
--- a/test/CodeGen/X86/vec_ins_extract-1.ll
+++ b/test/CodeGen/X86/vec_ins_extract-1.ll
@@ -12,6 +12,7 @@ define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
+; X32-NEXT: andl $3, %eax
; X32-NEXT: movaps %xmm0, (%esp)
; X32-NEXT: movl $76, (%esp,%eax,4)
; X32-NEXT: movl (%esp), %eax
@@ -21,9 +22,10 @@ define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
;
; X64-LABEL: t0:
; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movslq %edi, %rax
-; X64-NEXT: movl $76, -24(%rsp,%rax,4)
+; X64-NEXT: andl $3, %edi
+; X64-NEXT: movl $76, -24(%rsp,%rdi,4)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: retq
%t13 = insertelement <4 x i32> %t8, i32 76, i32 %t7
@@ -38,6 +40,7 @@ define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
+; X32-NEXT: andl $3, %eax
; X32-NEXT: movl $76, %ecx
; X32-NEXT: pinsrd $0, %ecx, %xmm0
; X32-NEXT: movdqa %xmm0, (%esp)
@@ -48,11 +51,12 @@ define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
;
; X64-LABEL: t1:
; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-NEXT: movl $76, %eax
; X64-NEXT: pinsrd $0, %eax, %xmm0
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movslq %edi, %rax
-; X64-NEXT: movl -24(%rsp,%rax,4), %eax
+; X64-NEXT: andl $3, %edi
+; X64-NEXT: movl -24(%rsp,%rdi,4), %eax
; X64-NEXT: retq
%t13 = insertelement <4 x i32> %t8, i32 76, i32 0
%t9 = extractelement <4 x i32> %t13, i32 %t7
@@ -66,6 +70,7 @@ define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
+; X32-NEXT: andl $3, %eax
; X32-NEXT: movdqa %xmm0, (%esp)
; X32-NEXT: pinsrd $0, (%esp,%eax,4), %xmm0
; X32-NEXT: movl %ebp, %esp
@@ -74,9 +79,10 @@ define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
;
; X64-LABEL: t2:
; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movslq %edi, %rax
-; X64-NEXT: pinsrd $0, -24(%rsp,%rax,4), %xmm0
+; X64-NEXT: andl $3, %edi
+; X64-NEXT: pinsrd $0, -24(%rsp,%rdi,4), %xmm0
; X64-NEXT: retq
%t9 = extractelement <4 x i32> %t8, i32 %t7
%t13 = insertelement <4 x i32> %t8, i32 %t9, i32 0
@@ -90,6 +96,7 @@ define <4 x i32> @t3(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
+; X32-NEXT: andl $3, %eax
; X32-NEXT: movaps %xmm0, (%esp)
; X32-NEXT: movss %xmm0, (%esp,%eax,4)
; X32-NEXT: movaps (%esp), %xmm0
@@ -99,9 +106,10 @@ define <4 x i32> @t3(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
;
; X64-LABEL: t3:
; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movslq %edi, %rax
-; X64-NEXT: movss %xmm0, -24(%rsp,%rax,4)
+; X64-NEXT: andl $3, %edi
+; X64-NEXT: movss %xmm0, -24(%rsp,%rdi,4)
; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: retq
%t9 = extractelement <4 x i32> %t8, i32 0
diff --git a/test/CodeGen/X86/vec_insert-4.ll b/test/CodeGen/X86/vec_insert-4.ll
index c847ac983003..82627c54e663 100644
--- a/test/CodeGen/X86/vec_insert-4.ll
+++ b/test/CodeGen/X86/vec_insert-4.ll
@@ -10,6 +10,7 @@ define <8 x float> @f(<8 x float> %a, i32 %b) nounwind {
; X32-NEXT: andl $-32, %esp
; X32-NEXT: subl $64, %esp
; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: andl $7, %eax
; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT: movaps %xmm0, (%esp)
; X32-NEXT: movl $1084227584, (%esp,%eax,4) ## imm = 0x40A00000
@@ -25,10 +26,11 @@ define <8 x float> @f(<8 x float> %a, i32 %b) nounwind {
; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: andq $-32, %rsp
; X64-NEXT: subq $64, %rsp
+; X64-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; X64-NEXT: movaps %xmm0, (%rsp)
-; X64-NEXT: movslq %edi, %rax
-; X64-NEXT: movl $1084227584, (%rsp,%rax,4) ## imm = 0x40A00000
+; X64-NEXT: andl $7, %edi
+; X64-NEXT: movl $1084227584, (%rsp,%rdi,4) ## imm = 0x40A00000
; X64-NEXT: movaps (%rsp), %xmm0
; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; X64-NEXT: movq %rbp, %rsp
diff --git a/test/CodeGen/X86/vec_insert-8.ll b/test/CodeGen/X86/vec_insert-8.ll
index d612e7eb10d3..4074b6d32353 100644
--- a/test/CodeGen/X86/vec_insert-8.ll
+++ b/test/CodeGen/X86/vec_insert-8.ll
@@ -11,10 +11,11 @@ define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
-; X32-NEXT: movl 8(%ebp), %eax
-; X32-NEXT: movl 12(%ebp), %ecx
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: andl $3, %eax
+; X32-NEXT: movl 8(%ebp), %ecx
; X32-NEXT: movaps %xmm0, (%esp)
-; X32-NEXT: movl %eax, (%esp,%ecx,4)
+; X32-NEXT: movl %ecx, (%esp,%eax,4)
; X32-NEXT: movaps (%esp), %xmm0
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
@@ -22,9 +23,10 @@ define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
;
; X64-LABEL: var_insert:
; X64: # BB#0: # %entry
+; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movslq %esi, %rax
-; X64-NEXT: movl %edi, -24(%rsp,%rax,4)
+; X64-NEXT: andl $3, %esi
+; X64-NEXT: movl %edi, -24(%rsp,%rsi,4)
; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: retq
entry:
@@ -40,6 +42,7 @@ define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind {
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: andl $3, %eax
; X32-NEXT: movaps %xmm0, (%esp)
; X32-NEXT: movl (%esp,%eax,4), %eax
; X32-NEXT: movl %ebp, %esp
@@ -48,9 +51,10 @@ define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind {
;
; X64-LABEL: var_extract:
; X64: # BB#0: # %entry
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movslq %edi, %rax
-; X64-NEXT: movl -24(%rsp,%rax,4), %eax
+; X64-NEXT: andl $3, %edi
+; X64-NEXT: movl -24(%rsp,%rdi,4), %eax
; X64-NEXT: retq
entry:
%tmp3 = extractelement <4 x i32> %x, i32 %idx
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 6a81cdc490fe..923af1216d05 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -4818,3 +4818,63 @@ define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
store <8 x float> %4, <8 x float>* %3, align 32
ret void
}
+
+define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind {
+; SSE-LABEL: sitofp_i32_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: cvtsi2sdl %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_i32_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cvt = sitofp i32 %a1 to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
+define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind {
+; SSE-LABEL: sitofp_i32_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: cvtsi2ssl %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_i32_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cvt = sitofp i32 %a1 to float
+ %res = insertelement <4 x float> %a0, float %cvt, i32 0
+ ret <4 x float> %res
+}
+
+define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind {
+; SSE-LABEL: sitofp_i64_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: cvtsi2sdq %rdi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_i64_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cvt = sitofp i64 %a1 to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
+define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind {
+; SSE-LABEL: sitofp_i64_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: cvtsi2ssq %rdi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_i64_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cvt = sitofp i64 %a1 to float
+ %res = insertelement <4 x float> %a0, float %cvt, i32 0
+ ret <4 x float> %res
+}
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index 440faa689fb8..9f0d4a7d7264 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -7,7 +7,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
@@ -89,6 +90,11 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX512-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
@@ -193,6 +199,11 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
@@ -339,6 +350,19 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
;
+; AVX512DQVL-LABEL: var_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $12, %xmm1
@@ -515,6 +539,14 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
@@ -624,6 +656,11 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX512-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsraq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
@@ -669,6 +706,12 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: xorps %xmm2, %xmm2
@@ -712,6 +755,12 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pextrw $0, %xmm1, %eax
@@ -907,6 +956,15 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1033,6 +1091,11 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
@@ -1114,6 +1177,11 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1207,6 +1275,18 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
;
+; AVX512DQVL-LABEL: constant_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1367,6 +1447,13 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
@@ -1480,6 +1567,11 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsraq $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1514,6 +1606,11 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrad $5, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrad $5, %xmm0
@@ -1543,6 +1640,11 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psraw $3, %xmm0
@@ -1586,6 +1688,15 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index 79902acfec24..aee2857157b6 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -5,6 +5,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+
;
; Variable Shifts
;
@@ -74,6 +77,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsubq %ymm3, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <4 x i64> %a, %b
ret <4 x i64> %shift
}
@@ -135,6 +143,11 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <8 x i32> %a, %b
ret <8 x i32> %shift
}
@@ -228,6 +241,19 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = ashr <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -375,6 +401,42 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = ashr <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -435,6 +497,11 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
%shift = ashr <4 x i64> %a, %splat
ret <4 x i64> %shift
@@ -476,6 +543,12 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
%shift = ashr <8 x i32> %a, %splat
ret <8 x i32> %shift
@@ -517,6 +590,12 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
%shift = ashr <16 x i16> %a, %splat
ret <16 x i16> %shift
@@ -662,6 +741,44 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = ashr <32 x i8> %a, %splat
ret <32 x i8> %shift
@@ -724,6 +841,11 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: constant_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
ret <4 x i64> %shift
}
@@ -769,6 +891,11 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: constant_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
ret <8 x i32> %shift
}
@@ -844,6 +971,18 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
}
@@ -981,6 +1120,42 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <32 x i8> %shift
}
@@ -1033,6 +1208,11 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsraq $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
}
@@ -1068,6 +1248,11 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrad $5, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %shift
}
@@ -1103,6 +1288,11 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsraw $3, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
}
@@ -1160,6 +1350,15 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v32i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
}
diff --git a/test/CodeGen/X86/vector-shift-ashr-512.ll b/test/CodeGen/X86/vector-shift-ashr-512.ll
index 2c9e433cfb2c..6cc98b5f3eeb 100644
--- a/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+
;
; Variable Shifts
;
@@ -99,399 +100,36 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %sil
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %sil
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: movzbl %sil, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -590,399 +228,36 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %sil
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %sil
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: movzbl %sil, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
%shift = ashr <64 x i8> %a, %splat
@@ -1080,252 +355,36 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT: vpsraw $4, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsllw $5, {{.*}}(%rip), %zmm3
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm3[8],zmm0[9],zmm3[9],zmm0[10],zmm3[10],zmm0[11],zmm3[11],zmm0[12],zmm3[12],zmm0[13],zmm3[13],zmm0[14],zmm3[14],zmm0[15],zmm3[15],zmm0[24],zmm3[24],zmm0[25],zmm3[25],zmm0[26],zmm3[26],zmm0[27],zmm3[27],zmm0[28],zmm3[28],zmm0[29],zmm3[29],zmm0[30],zmm3[30],zmm0[31],zmm3[31],zmm0[40],zmm3[40],zmm0[41],zmm3[41],zmm0[42],zmm3[42],zmm0[43],zmm3[43],zmm0[44],zmm3[44],zmm0[45],zmm3[45],zmm0[46],zmm3[46],zmm0[47],zmm3[47],zmm0[56],zmm3[56],zmm0[57],zmm3[57],zmm0[58],zmm3[58],zmm0[59],zmm3[59],zmm0[60],zmm3[60],zmm0[61],zmm3[61],zmm0[62],zmm3[62],zmm0[63],zmm3[63]
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm1, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm1, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm3[0],zmm0[1],zmm3[1],zmm0[2],zmm3[2],zmm0[3],zmm3[3],zmm0[4],zmm3[4],zmm0[5],zmm3[5],zmm0[6],zmm3[6],zmm0[7],zmm3[7],zmm0[16],zmm3[16],zmm0[17],zmm3[17],zmm0[18],zmm3[18],zmm0[19],zmm3[19],zmm0[20],zmm3[20],zmm0[21],zmm3[21],zmm0[22],zmm3[22],zmm0[23],zmm3[23],zmm0[32],zmm3[32],zmm0[33],zmm3[33],zmm0[34],zmm3[34],zmm0[35],zmm3[35],zmm0[36],zmm3[36],zmm0[37],zmm3[37],zmm0[38],zmm3[38],zmm0[39],zmm3[39],zmm0[48],zmm3[48],zmm0[49],zmm3[49],zmm0[50],zmm3[50],zmm0[51],zmm3[51],zmm0[52],zmm3[52],zmm0[53],zmm3[53],zmm0[54],zmm3[54],zmm0[55],zmm3[55]
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm3, %zmm3
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm3, %zmm3
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <64 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index a7e1a531b659..9b8c0def4558 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -7,6 +7,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
@@ -65,6 +67,11 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
@@ -162,6 +169,11 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
@@ -308,6 +320,19 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
;
+; AVX512DQVL-LABEL: var_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $12, %xmm1
@@ -433,6 +458,14 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $5, %xmm1
@@ -492,6 +525,11 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
@@ -533,6 +571,12 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: xorps %xmm2, %xmm2
@@ -576,6 +620,12 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pextrw $0, %xmm1, %eax
@@ -709,6 +759,15 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -798,6 +857,11 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -872,6 +936,11 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -965,6 +1034,18 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
;
+; AVX512DQVL-LABEL: constant_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1071,6 +1152,13 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
@@ -1131,6 +1219,11 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlq $7, %xmm0
@@ -1160,6 +1253,11 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrld $5, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrld $5, %xmm0
@@ -1189,6 +1287,11 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
@@ -1223,6 +1326,12 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll
index 25667e7d1661..58bb8f3e6ec0 100644
--- a/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -5,6 +5,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+
;
; Variable Shifts
;
@@ -51,6 +54,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <4 x i64> %a, %b
ret <4 x i64> %shift
}
@@ -112,6 +120,11 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <8 x i32> %a, %b
ret <8 x i32> %shift
}
@@ -205,6 +218,19 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = lshr <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -307,6 +333,30 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = lshr <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -346,6 +396,11 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
%shift = lshr <4 x i64> %a, %splat
ret <4 x i64> %shift
@@ -387,6 +442,12 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
%shift = lshr <8 x i32> %a, %splat
ret <8 x i32> %shift
@@ -428,6 +489,12 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
%shift = lshr <16 x i16> %a, %splat
ret <16 x i16> %shift
@@ -532,6 +599,32 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = lshr <32 x i8> %a, %splat
ret <32 x i8> %shift
@@ -579,6 +672,11 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: constant_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
ret <4 x i64> %shift
}
@@ -624,6 +722,11 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: constant_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
ret <8 x i32> %shift
}
@@ -699,6 +802,18 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
}
@@ -795,6 +910,30 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <32 x i8> %shift
}
@@ -834,6 +973,11 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
}
@@ -869,6 +1013,11 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrld $5, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrld $5, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %shift
}
@@ -904,6 +1053,11 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
}
@@ -947,6 +1101,12 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v32i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
}
diff --git a/test/CodeGen/X86/vector-shift-lshr-512.ll b/test/CodeGen/X86/vector-shift-lshr-512.ll
index 3da8f9437e57..905445f30162 100644
--- a/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -79,399 +79,21 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %sil
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %sil
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: movzbl %sil, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -553,399 +175,21 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %sil
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %sil
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: movzbl %sil, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
%shift = lshr <64 x i8> %a, %splat
@@ -1026,252 +270,21 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $5, {{.*}}(%rip), %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <64 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index 8706078b40c9..32334420f8b2 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -7,6 +7,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
@@ -63,6 +65,11 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
@@ -128,6 +135,11 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pslld $23, %xmm1
@@ -263,6 +275,19 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
;
+; AVX512DQVL-LABEL: var_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $12, %xmm1
@@ -383,6 +408,14 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $5, %xmm1
@@ -441,6 +474,11 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllq %xmm1, %xmm0
@@ -482,6 +520,12 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: xorps %xmm2, %xmm2
@@ -525,6 +569,12 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pextrw $0, %xmm1, %eax
@@ -651,6 +701,15 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -737,6 +796,11 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -792,6 +856,11 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
@@ -836,6 +905,16 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
;
+; AVX512DQVL-LABEL: constant_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
@@ -925,6 +1004,13 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
@@ -984,6 +1070,11 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX512-NEXT: vpsllq $7, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllq $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllq $7, %xmm0
@@ -1013,6 +1104,11 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pslld $5, %xmm0
@@ -1042,6 +1138,11 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $3, %xmm0
@@ -1074,6 +1175,12 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $3, %xmm0
diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll
index a1ef2791c1b0..104fa089c744 100644
--- a/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -5,6 +5,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Variable Shifts
@@ -49,6 +51,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <4 x i64> %a, %b
ret <4 x i64> %shift
}
@@ -93,6 +100,11 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <8 x i32> %a, %b
ret <8 x i32> %shift
}
@@ -180,6 +192,19 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = shl <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -271,6 +296,29 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = shl <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -310,6 +358,11 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
%shift = shl <4 x i64> %a, %splat
ret <4 x i64> %shift
@@ -351,6 +404,12 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
%shift = shl <8 x i32> %a, %splat
ret <8 x i32> %shift
@@ -392,6 +451,12 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
%shift = shl <16 x i16> %a, %splat
ret <16 x i16> %shift
@@ -487,6 +552,31 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = shl <32 x i8> %a, %splat
ret <32 x i8> %shift
@@ -531,6 +621,11 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: constant_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
ret <4 x i64> %shift
}
@@ -566,6 +661,11 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: constant_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
ret <8 x i32> %shift
}
@@ -609,6 +709,16 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
}
@@ -698,6 +808,29 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <32 x i8> %shift
}
@@ -737,6 +870,11 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $7, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
}
@@ -772,6 +910,11 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpslld $5, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpslld $5, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %shift
}
@@ -807,6 +950,11 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
}
@@ -849,6 +997,12 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v32i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
}
diff --git a/test/CodeGen/X86/vector-shift-shl-512.ll b/test/CodeGen/X86/vector-shift-shl-512.ll
index b9c9b56427f1..180d6f3a3b03 100644
--- a/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -76,399 +76,19 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %sil
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %sil
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: movzbl %sil, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -547,399 +167,19 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %sil
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %sil
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: movzbl %sil, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
%shift = shl <64 x i8> %a, %splat
@@ -1013,252 +253,19 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $5, {{.*}}(%rip), %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <64 x i8> %shift
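In both hunks above, the old scalar expansion (per-byte vpextrb / shlb %cl / vpinsrb) is replaced by a bit-serial vector sequence: the shift amounts are moved into each byte's sign bit with vpsllw $5, vpmovb2m turns those sign bits into a k-mask, and masked vpsllw $4, vpsllw $2 and vpaddb apply shifts of 4, 2 and 1 only where the corresponding amount bit is set (vpaddb on the amount vector exposes the next lower bit between steps, and the vpandq constants clear bits that the 16-bit vpsllw lets cross byte boundaries). The scalar C sketch below only models that strategy for one byte; it is illustrative, not part of the patch, and the helper name is made up.

```c
#include <stdint.h>

/* Bit-serial byte shift: test bits 2..0 of the amount and conditionally
 * apply shifts of 4, 2 and 1.  In the vector code each "if" is a k-mask
 * produced by vpmovb2m after vpsllw $5 / vpaddb on the amount vector. */
static uint8_t shl_byte_bitserial(uint8_t a, uint8_t amt) {
    if (amt & 4) a = (uint8_t)(a << 4);
    if (amt & 2) a = (uint8_t)(a << 2);
    if (amt & 1) a = (uint8_t)(a << 1);
    return a;
}
```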
diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll
new file mode 100644
index 000000000000..defc3e918b24
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -0,0 +1,333 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=skx | FileCheck %s --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=knl | FileCheck %s --check-prefix=KNL
+
+; Expand 128 -> 256, covering <4 x float> and <2 x double>.
+define <8 x float> @expand(<4 x float> %a) {
+; SKX-LABEL: expand:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: movb $5, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand:
+; KNL: # BB#0:
+; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7]
+; KNL-NEXT: retq
+ %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 5, i32 1, i32 5, i32 5, i32 5, i32 5, i32 5>
+ ret <8 x float> %res
+}
+
+define <8 x float> @expand1(<4 x float> %a ) {
+; SKX-LABEL: expand1:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: movb $-86, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand1:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <u,0,u,1,u,2,u,3>
+; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; KNL-NEXT: retq
+ %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+ ret <8 x float> %res
+}
+
+; Expand 128 -> 256: <2 x double> -> <4 x double>.
+define <4 x double> @expand2(<2 x double> %a) {
+; SKX-LABEL: expand2:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: movb $9, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandpd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand2:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; KNL-NEXT: retq
+ %res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 2, i32 1>
+ ret <4 x double> %res
+}
+
+; Expand 128 -> 256: <4 x i32> -> <8 x i32>.
+define <8 x i32> @expand3(<4 x i32> %a ) {
+; SKX-LABEL: expand3:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: movb $-127, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand3:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT: vpbroadcastq %xmm0, %ymm0
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
+; KNL-NEXT: retq
+ %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32> <i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,i32 5>
+ ret <8 x i32> %res
+}
+
+; Expand 128 -> 256: <2 x i64> -> <4 x i64>.
+define <4 x i64> @expand4(<2 x i64> %a ) {
+; SKX-LABEL: expand4:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: movb $9, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand4:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; KNL-NEXT: retq
+ %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> <i32 2, i32 0, i32 0, i32 3>
+ ret <4 x i64> %res
+}
+
+; Negative test for 128 -> 256.
+define <8 x float> @expand5(<4 x float> %a ) {
+; SKX-LABEL: expand5:
+; SKX: # BB#0:
+; SKX-NEXT: vbroadcastss %xmm0, %ymm0
+; SKX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; SKX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand5:
+; KNL: # BB#0:
+; KNL-NEXT: vbroadcastss %xmm0, %ymm0
+; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; KNL-NEXT: retq
+ %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
+ ret <8 x float> %res
+}
+
+; Expand 256 -> 512, covering <8 x float> and <16 x float>.
+define <8 x float> @expand6(<4 x float> %a ) {
+; SKX-LABEL: expand6:
+; SKX: # BB#0:
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vinsertf{{.*}}$1, %xmm0, %ymm1, %ymm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand6:
+; KNL: # BB#0:
+; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; KNL-NEXT: retq
+ %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <16 x float> @expand7(<8 x float> %a) {
+; SKX-LABEL: expand7:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT: movw $1285, %ax # imm = 0x505
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand7:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: movw $1285, %ax # imm = 0x505
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+ %res = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 8, i32 8, i32 8, i32 8, i32 2, i32 8, i32 3, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x float> %res
+}
+
+define <16 x float> @expand8(<8 x float> %a ) {
+; SKX-LABEL: expand8:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand8:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+ %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <16 x float> %res
+}
+
+; Expand 256 -> 512, covering <4 x double> and <8 x double>.
+define <8 x double> @expand9(<4 x double> %a) {
+; SKX-LABEL: expand9:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT: movb $-127, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand9:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: movb $-127, %al
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+ %res = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
+ ret <8 x double> %res
+}
+
+define <16 x i32> @expand10(<8 x i32> %a ) {
+; SKX-LABEL: expand10:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand10:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+ %res = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <16 x i32> %res
+}
+
+define <8 x i64> @expand11(<4 x i64> %a) {
+; SKX-LABEL: expand11:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT: movb $-127, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand11:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: movb $-127, %al
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+ %res = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
+ ret <8 x i64> %res
+}
+
+; Negative test for 256 -> 512.
+define <16 x float> @expand12(<8 x float> %a) {
+; SKX-LABEL: expand12:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
+; SKX-NEXT: vxorps %zmm1, %zmm1, %zmm1
+; SKX-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand12:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
+; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: retq
+ %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8,i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8>
+ ret <16 x float> %res
+}
+
+define <16 x float> @expand13(<8 x float> %a ) {
+; SKX-LABEL: expand13:
+; SKX: # BB#0:
+; SKX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; SKX-NEXT: vinsertf32x8 $1, %ymm0, %zmm1, %zmm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand13:
+; KNL: # BB#0:
+; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT: retq
+ %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+; Checks the case where the source vector holds mixed values and the shuffle mask points at the zero-valued elements of that vector.
+
+define <8 x float> @expand14(<4 x float> %a) {
+; SKX-LABEL: expand14:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: movb $20, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand14:
+; KNL: # BB#0:
+; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
+; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0]
+; KNL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
+; KNL-NEXT: retq
+ %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
+ %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 3, i32 3, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
+ ret <8 x float> %res
+}
+
+;Negative test.
+define <8 x float> @expand15(<4 x float> %a) {
+; SKX-LABEL: expand15:
+; SKX: # BB#0:
+; SKX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SKX-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>
+; SKX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0]
+; SKX-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
+; SKX-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand15:
+; KNL: # BB#0:
+; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
+; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0]
+; KNL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
+; KNL-NEXT: retq
+ %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
+ %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
+ ret <8 x float> %res
+}
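The SKX checks in the new file above fold shuffles against a zero vector into zero-masked VEXPANDPS/VEXPANDPD/VPEXPANDD/VPEXPANDQ: each output lane that takes a real source element contributes a set bit to the k-register immediate, and lanes that take zeros contribute clear bits (the selected source elements must also appear in source order for the expand pattern to apply, which is what the negative tests exercise). For @expand, the mask <0,5,1,5,5,5,5,5> keeps source lanes at outputs 0 and 2, giving 0b00000101 = 5, the "movb $5" in the checks. The sketch below, with a made-up helper name and restricted to the case where operand 0 is the source and operand 1 is zeroinitializer, is illustrative only and not part of the patch.

```c
#include <stdint.h>

/* k-mask for a shuffle of <src, zeroinitializer>: output lane i takes a
 * real source element iff mask[i] indexes operand 0 (mask[i] < src_width).
 * E.g. @expand: mask {0,5,1,5,5,5,5,5}, src_width 4 -> 0b101 = 5;
 *      @expand7: bits 0,2,8,10 -> 0x505 = 1285. */
static uint16_t expand_kmask(const int *mask, int out_width, int src_width) {
    uint16_t k = 0;
    for (int i = 0; i < out_width; ++i)
        if (mask[i] >= 0 && mask[i] < src_width)
            k |= (uint16_t)(1u << i);
    return k;
}
```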
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 23e40a6572af..b79df1facfa1 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -91,6 +91,20 @@ define <4 x float> @combine_vpermil2ps_1z74(<4 x float> %a0, <4 x float> %a1) {
ret <4 x float> %res1
}
+define <4 x float> @combine_vpermil2ps_02zu(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: combine_vpermil2ps_02zu:
+; X32: # BB#0:
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: combine_vpermil2ps_02zu:
+; X64: # BB#0:
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X64-NEXT: retq
+ %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 undef>, i8 0)
+ ret <4 x float> %res0
+}
+
define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> %a1) {
; X32-LABEL: combine_vpermil2ps256_identity:
; X32: # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-variable-128.ll b/test/CodeGen/X86/vector-shuffle-variable-128.ll
index d130e7ff00b2..70b7fb16fc25 100644
--- a/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -12,6 +12,8 @@
define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i64 %i1) nounwind {
; SSE-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
; SSE: # BB#0:
+; SSE-NEXT: andl $1, %esi
+; SSE-NEXT: andl $1, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
@@ -19,6 +21,8 @@ define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i6
;
; AVX-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
; AVX: # BB#0:
+; AVX-NEXT: andl $1, %esi
+; AVX-NEXT: andl $1, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
@@ -33,9 +37,11 @@ define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i6
define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) nounwind {
; SSE-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
; SSE: # BB#0:
-; SSE-NEXT: movslq %edi, %rax
+; SSE-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE-NEXT: andl $1, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movslq %esi, %rcx
+; SSE-NEXT: andl $1, %esi
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -43,9 +49,11 @@ define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1)
;
; AVX-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
; AVX: # BB#0:
-; AVX-NEXT: movslq %edi, %rax
+; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: andl $1, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movslq %esi, %rcx
+; AVX-NEXT: andl $1, %esi
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -60,11 +68,15 @@ define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1)
define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE2-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; SSE2: # BB#0:
-; SSE2-NEXT: movslq %edi, %rax
-; SSE2-NEXT: movslq %esi, %rsi
-; SSE2-NEXT: movslq %edx, %rdx
+; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2-NEXT: andl $3, %edi
+; SSE2-NEXT: andl $3, %esi
+; SSE2-NEXT: andl $3, %edx
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movslq %ecx, %rcx
+; SSE2-NEXT: andl $3, %ecx
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -76,11 +88,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
;
; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movslq %edi, %rax
-; SSSE3-NEXT: movslq %esi, %rsi
-; SSSE3-NEXT: movslq %edx, %rdx
+; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3-NEXT: andl $3, %edi
+; SSSE3-NEXT: andl $3, %esi
+; SSSE3-NEXT: andl $3, %edx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movslq %ecx, %rcx
+; SSSE3-NEXT: andl $3, %ecx
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -92,11 +108,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
;
; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; SSE41: # BB#0:
-; SSE41-NEXT: movslq %edi, %rax
-; SSE41-NEXT: movslq %esi, %rsi
-; SSE41-NEXT: movslq %edx, %rdx
+; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41-NEXT: andl $3, %edi
+; SSE41-NEXT: andl $3, %esi
+; SSE41-NEXT: andl $3, %edx
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movslq %ecx, %rcx
+; SSE41-NEXT: andl $3, %ecx
; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -105,11 +125,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
;
; AVX-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; AVX: # BB#0:
-; AVX-NEXT: movslq %edi, %rax
-; AVX-NEXT: movslq %esi, %rsi
-; AVX-NEXT: movslq %edx, %rdx
+; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: andl $3, %edi
+; AVX-NEXT: andl $3, %esi
+; AVX-NEXT: andl $3, %edx
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: andl $3, %ecx
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -129,11 +153,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE2-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; SSE2: # BB#0:
-; SSE2-NEXT: movslq %edi, %rax
-; SSE2-NEXT: movslq %esi, %rsi
-; SSE2-NEXT: movslq %edx, %rdx
+; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2-NEXT: andl $3, %edi
+; SSE2-NEXT: andl $3, %esi
+; SSE2-NEXT: andl $3, %edx
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movslq %ecx, %rcx
+; SSE2-NEXT: andl $3, %ecx
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -145,11 +173,15 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i
;
; SSSE3-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movslq %edi, %rax
-; SSSE3-NEXT: movslq %esi, %rsi
-; SSSE3-NEXT: movslq %edx, %rdx
+; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3-NEXT: andl $3, %edi
+; SSSE3-NEXT: andl $3, %esi
+; SSSE3-NEXT: andl $3, %edx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movslq %ecx, %rcx
+; SSSE3-NEXT: andl $3, %ecx
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -161,11 +193,15 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i
;
; SSE41-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; SSE41: # BB#0:
-; SSE41-NEXT: movslq %edi, %rax
-; SSE41-NEXT: movslq %esi, %rsi
-; SSE41-NEXT: movslq %edx, %rdx
+; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41-NEXT: andl $3, %edi
+; SSE41-NEXT: andl $3, %esi
+; SSE41-NEXT: andl $3, %edx
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movslq %ecx, %rcx
+; SSE41-NEXT: andl $3, %ecx
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrd $1, -24(%rsp,%rsi,4), %xmm0
; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0
@@ -174,11 +210,15 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i
;
; AVX-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; AVX: # BB#0:
-; AVX-NEXT: movslq %edi, %rax
-; AVX-NEXT: movslq %esi, %rsi
-; AVX-NEXT: movslq %edx, %rdx
+; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: andl $3, %edi
+; AVX-NEXT: andl $3, %esi
+; AVX-NEXT: andl $3, %edx
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: andl $3, %ecx
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
@@ -204,34 +244,36 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSE2-NEXT: movswq %di, %rax
-; SSE2-NEXT: movswq %si, %rsi
-; SSE2-NEXT: movswq %dx, %rdx
-; SSE2-NEXT: movswq %cx, %r10
-; SSE2-NEXT: movswq %r8w, %r11
+; SSE2-NEXT: andl $7, %edi
+; SSE2-NEXT: andl $7, %esi
+; SSE2-NEXT: andl $7, %edx
+; SSE2-NEXT: andl $7, %ecx
+; SSE2-NEXT: andl $7, %r8d
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movswq %r9w, %r8
-; SSE2-NEXT: movswq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movswq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
-; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; SSE2-NEXT: andl $7, %r9d
+; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; SSE2-NEXT: andl $7, %r10d
+; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $7, %eax
+; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %r10d
; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %edi
; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %esi
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movd %r10d, %xmm0
+; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %edx
+; SSE2-NEXT: movd %edx, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %ecx
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzwl -24(%rsp,%r11,2), %eax
-; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %edx
+; SSE2-NEXT: movd %edx, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: movd %edi, %xmm1
+; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: movd %esi, %xmm1
-; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %eax
+; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
@@ -246,34 +288,36 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSSE3-NEXT: movswq %di, %rax
-; SSSE3-NEXT: movswq %si, %rsi
-; SSSE3-NEXT: movswq %dx, %rdx
-; SSSE3-NEXT: movswq %cx, %r10
-; SSSE3-NEXT: movswq %r8w, %r11
+; SSSE3-NEXT: andl $7, %edi
+; SSSE3-NEXT: andl $7, %esi
+; SSSE3-NEXT: andl $7, %edx
+; SSSE3-NEXT: andl $7, %ecx
+; SSSE3-NEXT: andl $7, %r8d
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movswq %r9w, %r8
-; SSSE3-NEXT: movswq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: movswq {{[0-9]+}}(%rsp), %rdi
-; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
-; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; SSSE3-NEXT: andl $7, %r9d
+; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; SSSE3-NEXT: andl $7, %r10d
+; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $7, %eax
+; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d
; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi
; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movd %r10d, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx
+; SSSE3-NEXT: movd %edx, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %ecx
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzwl -24(%rsp,%r11,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSSE3-NEXT: movd %edi, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %edx
+; SSSE3-NEXT: movd %edx, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: movd %edi, %xmm1
+; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT: movd %esi, %xmm1
-; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax
+; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
@@ -282,68 +326,66 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
;
; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
; SSE41: # BB#0:
-; SSE41-NEXT: pushq %rbx
; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
; SSE41-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSE41-NEXT: movswq %di, %rax
-; SSE41-NEXT: movswq %si, %rbx
-; SSE41-NEXT: movswq %dx, %r11
-; SSE41-NEXT: movswq %cx, %r10
-; SSE41-NEXT: movswq %r8w, %rdi
+; SSE41-NEXT: andl $7, %edi
+; SSE41-NEXT: andl $7, %esi
+; SSE41-NEXT: andl $7, %edx
+; SSE41-NEXT: andl $7, %ecx
+; SSE41-NEXT: andl $7, %r8d
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movswq %r9w, %rcx
-; SSE41-NEXT: movswq {{[0-9]+}}(%rsp), %rdx
-; SSE41-NEXT: movswq {{[0-9]+}}(%rsp), %rsi
-; SSE41-NEXT: movzwl -16(%rsp,%rdx,2), %edx
-; SSE41-NEXT: movzwl -16(%rsp,%rsi,2), %esi
-; SSE41-NEXT: movzwl -16(%rsp,%rax,2), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pinsrw $1, -16(%rsp,%rbx,2), %xmm0
-; SSE41-NEXT: pinsrw $2, -16(%rsp,%r11,2), %xmm0
-; SSE41-NEXT: pinsrw $3, -16(%rsp,%r10,2), %xmm0
-; SSE41-NEXT: pinsrw $4, -16(%rsp,%rdi,2), %xmm0
-; SSE41-NEXT: pinsrw $5, -16(%rsp,%rcx,2), %xmm0
-; SSE41-NEXT: pinsrw $6, %edx, %xmm0
-; SSE41-NEXT: pinsrw $7, %esi, %xmm0
-; SSE41-NEXT: popq %rbx
+; SSE41-NEXT: andl $7, %r9d
+; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; SSE41-NEXT: andl $7, %r10d
+; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $7, %eax
+; SSE41-NEXT: movzwl -24(%rsp,%r10,2), %r10d
+; SSE41-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSE41-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; SSE41-NEXT: movd %edi, %xmm0
+; SSE41-NEXT: pinsrw $1, -24(%rsp,%rsi,2), %xmm0
+; SSE41-NEXT: pinsrw $2, -24(%rsp,%rdx,2), %xmm0
+; SSE41-NEXT: pinsrw $3, -24(%rsp,%rcx,2), %xmm0
+; SSE41-NEXT: pinsrw $4, -24(%rsp,%r8,2), %xmm0
+; SSE41-NEXT: pinsrw $5, -24(%rsp,%r9,2), %xmm0
+; SSE41-NEXT: pinsrw $6, %r10d, %xmm0
+; SSE41-NEXT: pinsrw $7, %eax, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
; AVX: # BB#0:
-; AVX-NEXT: pushq %r14
-; AVX-NEXT: pushq %rbx
; AVX-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
; AVX-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; AVX-NEXT: movswq %di, %r10
-; AVX-NEXT: movswq %si, %r11
-; AVX-NEXT: movswq %dx, %r14
-; AVX-NEXT: movswq %cx, %rcx
-; AVX-NEXT: movswq %r8w, %rdi
+; AVX-NEXT: andl $7, %edi
+; AVX-NEXT: andl $7, %esi
+; AVX-NEXT: andl $7, %edx
+; AVX-NEXT: andl $7, %ecx
+; AVX-NEXT: andl $7, %r8d
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movswq %r9w, %rax
-; AVX-NEXT: movswq {{[0-9]+}}(%rsp), %rsi
-; AVX-NEXT: movswq {{[0-9]+}}(%rsp), %rdx
-; AVX-NEXT: movzwl -24(%rsp,%rsi,2), %esi
-; AVX-NEXT: movzwl -24(%rsp,%rdx,2), %edx
-; AVX-NEXT: movzwl -24(%rsp,%r10,2), %ebx
-; AVX-NEXT: vmovd %ebx, %xmm0
-; AVX-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $2, -24(%rsp,%r14,2), %xmm0, %xmm0
+; AVX-NEXT: andl $7, %r9d
+; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; AVX-NEXT: andl $7, %r10d
+; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $7, %eax
+; AVX-NEXT: movzwl -24(%rsp,%r10,2), %r10d
+; AVX-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0
; AVX-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $4, -24(%rsp,%rdi,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: popq %r14
+; AVX-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
%x0 = extractelement <8 x i16> %x, i16 %i0
%x1 = extractelement <8 x i16> %x, i16 %i1
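The hunks in this file replace the old movswq/movslq/movsbq sign-extensions of the variable indices with 32-bit "andl" masks: the vector is still spilled to a stack slot, but each index is clamped to the element range before the reload, which keeps every load inside the spill slot and drops the extra 64-bit temporaries (hence the removed pushq/popq of %rbx and %r14). A one-line C model of the new per-lane reload for the v8i16 case, illustrative only and not part of the patch:

```c
#include <stdint.h>

/* Reload one lane of a spilled <8 x i16>: mask the index to 0..7
 * (the "andl $7"), then do the zero-extending 16-bit load
 * (the "movzwl -24(%rsp,%reg,2)").  The v16i8 variant masks with 15. */
static uint16_t load_lane_v8i16(const uint16_t spill[8], uint32_t idx) {
    return spill[idx & 7];
}
```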
@@ -374,54 +416,64 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; SSE2-NEXT: andl $15, %r10d
; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %r11
; SSE2-NEXT: movzbl (%r10,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm15
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm8
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm9
-; SSE2-NEXT: movsbq %dl, %rax
-; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: andl $15, %edx
+; SSE2-NEXT: movzbl (%rdx,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm10
-; SSE2-NEXT: movsbq %dil, %rax
-; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: andl $15, %edi
+; SSE2-NEXT: movzbl (%rdi,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm11
-; SSE2-NEXT: movsbq %r8b, %rax
-; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: andl $15, %r8d
+; SSE2-NEXT: movzbl (%r8,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm7
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm12
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm13
-; SSE2-NEXT: movsbq %cl, %rax
-; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl (%rcx,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm6
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm14
-; SSE2-NEXT: movsbq %sil, %rax
-; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: andl $15, %esi
+; SSE2-NEXT: movzbl (%rsi,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: movsbq %r9b, %rax
-; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: andl $15, %r9d
+; SSE2-NEXT: movzbl (%r9,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -449,54 +501,64 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; SSSE3-NEXT: andl $15, %r10d
; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %r11
; SSSE3-NEXT: movzbl (%r10,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm15
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm8
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm9
-; SSSE3-NEXT: movsbq %dl, %rax
-; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: andl $15, %edx
+; SSSE3-NEXT: movzbl (%rdx,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm10
-; SSSE3-NEXT: movsbq %dil, %rax
-; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: andl $15, %edi
+; SSSE3-NEXT: movzbl (%rdi,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm11
-; SSSE3-NEXT: movsbq %r8b, %rax
-; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: andl $15, %r8d
+; SSSE3-NEXT: movzbl (%r8,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm7
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm12
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm13
-; SSSE3-NEXT: movsbq %cl, %rax
-; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl (%rcx,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm6
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm14
-; SSSE3-NEXT: movsbq %sil, %rax
-; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: andl $15, %esi
+; SSSE3-NEXT: movzbl (%rsi,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm5
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm4
-; SSSE3-NEXT: movsbq %r9b, %rax
-; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: andl $15, %r9d
+; SSSE3-NEXT: movzbl (%r9,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -520,7 +582,6 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE41-NEXT: pushq %rbp
; SSE41-NEXT: pushq %r15
; SSE41-NEXT: pushq %r14
-; SSE41-NEXT: pushq %r13
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
@@ -529,54 +590,63 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSE41-NEXT: movsbq %dil, %r15
-; SSE41-NEXT: movsbq %sil, %r14
-; SSE41-NEXT: movsbq %dl, %r11
-; SSE41-NEXT: movsbq %cl, %r10
-; SSE41-NEXT: movsbq %r8b, %r8
+; SSE41-NEXT: andl $15, %edi
+; SSE41-NEXT: andl $15, %esi
+; SSE41-NEXT: andl $15, %edx
+; SSE41-NEXT: andl $15, %ecx
+; SSE41-NEXT: andl $15, %r8d
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movsbq %r9b, %r9
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r12
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r13
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rbp
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rbx
+; SSE41-NEXT: andl $15, %r9d
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; SSE41-NEXT: andl $15, %r10d
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; SSE41-NEXT: andl $15, %r11d
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; SSE41-NEXT: andl $15, %r14d
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; SSE41-NEXT: andl $15, %r15d
; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SSE41-NEXT: movzbl (%r15,%rax), %ecx
-; SSE41-NEXT: movd %ecx, %xmm0
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r15
-; SSE41-NEXT: pinsrb $1, (%r14,%rax), %xmm0
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r14
-; SSE41-NEXT: pinsrb $2, (%r11,%rax), %xmm0
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r11
-; SSE41-NEXT: pinsrb $3, (%r10,%rax), %xmm0
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT: movzbl (%rdi,%rax), %edi
+; SSE41-NEXT: movd %edi, %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; SSE41-NEXT: andl $15, %r12d
+; SSE41-NEXT: pinsrb $1, (%rsi,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; SSE41-NEXT: andl $15, %esi
+; SSE41-NEXT: pinsrb $2, (%rdx,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; SSE41-NEXT: andl $15, %edx
+; SSE41-NEXT: pinsrb $3, (%rcx,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE41-NEXT: andl $15, %ecx
; SSE41-NEXT: pinsrb $4, (%r8,%rax), %xmm0
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; SSE41-NEXT: andl $15, %ebx
; SSE41-NEXT: pinsrb $5, (%r9,%rax), %xmm0
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rdx
-; SSE41-NEXT: movzbl (%r12,%rax), %esi
-; SSE41-NEXT: movzbl (%r13,%rax), %edi
-; SSE41-NEXT: movzbl (%rbp,%rax), %ebp
-; SSE41-NEXT: movzbl (%rbx,%rax), %ebx
-; SSE41-NEXT: movzbl (%r15,%rax), %r8d
-; SSE41-NEXT: movzbl (%r14,%rax), %r9d
-; SSE41-NEXT: movzbl (%r11,%rax), %r11d
-; SSE41-NEXT: movzbl (%r10,%rax), %r10d
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; SSE41-NEXT: andl $15, %edi
+; SSE41-NEXT: movzbl (%r10,%rax), %r8d
+; SSE41-NEXT: movzbl (%r11,%rax), %r9d
+; SSE41-NEXT: movzbl (%r14,%rax), %r10d
+; SSE41-NEXT: movzbl (%r15,%rax), %r11d
+; SSE41-NEXT: movzbl (%r12,%rax), %ebp
+; SSE41-NEXT: movzbl (%rsi,%rax), %esi
+; SSE41-NEXT: movzbl (%rdx,%rax), %edx
; SSE41-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE41-NEXT: movzbl (%rdx,%rax), %eax
-; SSE41-NEXT: pinsrb $6, %esi, %xmm0
-; SSE41-NEXT: pinsrb $7, %edi, %xmm0
-; SSE41-NEXT: pinsrb $8, %ebp, %xmm0
-; SSE41-NEXT: pinsrb $9, %ebx, %xmm0
-; SSE41-NEXT: pinsrb $10, %r8d, %xmm0
-; SSE41-NEXT: pinsrb $11, %r9d, %xmm0
-; SSE41-NEXT: pinsrb $12, %r11d, %xmm0
-; SSE41-NEXT: pinsrb $13, %r10d, %xmm0
-; SSE41-NEXT: pinsrb $14, %ecx, %xmm0
+; SSE41-NEXT: movzbl (%rbx,%rax), %ebx
+; SSE41-NEXT: movzbl (%rdi,%rax), %eax
+; SSE41-NEXT: pinsrb $6, %r8d, %xmm0
+; SSE41-NEXT: pinsrb $7, %r9d, %xmm0
+; SSE41-NEXT: pinsrb $8, %r10d, %xmm0
+; SSE41-NEXT: pinsrb $9, %r11d, %xmm0
+; SSE41-NEXT: pinsrb $10, %ebp, %xmm0
+; SSE41-NEXT: pinsrb $11, %esi, %xmm0
+; SSE41-NEXT: pinsrb $12, %edx, %xmm0
+; SSE41-NEXT: pinsrb $13, %ecx, %xmm0
+; SSE41-NEXT: pinsrb $14, %ebx, %xmm0
; SSE41-NEXT: pinsrb $15, %eax, %xmm0
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
; SSE41-NEXT: popq %r14
; SSE41-NEXT: popq %r15
; SSE41-NEXT: popq %rbp
@@ -587,7 +657,6 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; AVX-NEXT: pushq %rbp
; AVX-NEXT: pushq %r15
; AVX-NEXT: pushq %r14
-; AVX-NEXT: pushq %r13
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
; AVX-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
@@ -596,54 +665,63 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; AVX-NEXT: movsbq %dil, %r10
-; AVX-NEXT: movsbq %sil, %r11
-; AVX-NEXT: movsbq %dl, %r14
-; AVX-NEXT: movsbq %cl, %r15
-; AVX-NEXT: movsbq %r8b, %r8
+; AVX-NEXT: andl $15, %edi
+; AVX-NEXT: andl $15, %esi
+; AVX-NEXT: andl $15, %edx
+; AVX-NEXT: andl $15, %ecx
+; AVX-NEXT: andl $15, %r8d
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movsbq %r9b, %r9
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r12
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r13
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rbp
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rdi
-; AVX-NEXT: movzbl (%r10,%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
-; AVX-NEXT: vpinsrb $1, (%r11,%rdi), %xmm0, %xmm0
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r11
-; AVX-NEXT: vpinsrb $2, (%r14,%rdi), %xmm0, %xmm0
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r14
-; AVX-NEXT: vpinsrb $3, (%r15,%rdi), %xmm0, %xmm0
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r15
-; AVX-NEXT: vpinsrb $4, (%r8,%rdi), %xmm0, %xmm0
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r8
-; AVX-NEXT: vpinsrb $5, (%r9,%rdi), %xmm0, %xmm0
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rsi
-; AVX-NEXT: movzbl (%r12,%rdi), %edx
-; AVX-NEXT: movzbl (%r13,%rdi), %ebx
-; AVX-NEXT: movzbl (%rbp,%rdi), %ebp
-; AVX-NEXT: movzbl (%rcx,%rdi), %ecx
-; AVX-NEXT: movzbl (%r10,%rdi), %eax
-; AVX-NEXT: movzbl (%r11,%rdi), %r9d
-; AVX-NEXT: movzbl (%r14,%rdi), %r10d
-; AVX-NEXT: movzbl (%r15,%rdi), %r11d
-; AVX-NEXT: movzbl (%r8,%rdi), %r8d
-; AVX-NEXT: movzbl (%rsi,%rdi), %esi
-; AVX-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $7, %ebx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $15, %esi, %xmm0, %xmm0
+; AVX-NEXT: andl $15, %r9d
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX-NEXT: andl $15, %r10d
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX-NEXT: andl $15, %r11d
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX-NEXT: andl $15, %r14d
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX-NEXT: andl $15, %r15d
+; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movzbl (%rdi,%rax), %edi
+; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX-NEXT: andl $15, %r12d
+; AVX-NEXT: vpinsrb $1, (%rsi,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX-NEXT: andl $15, %esi
+; AVX-NEXT: vpinsrb $2, (%rdx,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX-NEXT: andl $15, %edx
+; AVX-NEXT: vpinsrb $3, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX-NEXT: andl $15, %ecx
+; AVX-NEXT: vpinsrb $4, (%r8,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX-NEXT: andl $15, %ebx
+; AVX-NEXT: vpinsrb $5, (%r9,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX-NEXT: andl $15, %edi
+; AVX-NEXT: movzbl (%r10,%rax), %r8d
+; AVX-NEXT: movzbl (%r11,%rax), %r9d
+; AVX-NEXT: movzbl (%r14,%rax), %r10d
+; AVX-NEXT: movzbl (%r15,%rax), %r11d
+; AVX-NEXT: movzbl (%r12,%rax), %ebp
+; AVX-NEXT: movzbl (%rsi,%rax), %esi
+; AVX-NEXT: movzbl (%rdx,%rax), %edx
+; AVX-NEXT: movzbl (%rcx,%rax), %ecx
+; AVX-NEXT: movzbl (%rbx,%rax), %ebx
+; AVX-NEXT: movzbl (%rdi,%rax), %eax
+; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, %r9d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, %ebx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r12
-; AVX-NEXT: popq %r13
; AVX-NEXT: popq %r14
; AVX-NEXT: popq %r15
; AVX-NEXT: popq %rbp
@@ -690,11 +768,15 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwind {
; SSE2-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
; SSE2: # BB#0:
-; SSE2-NEXT: movslq (%rdi), %rax
+; SSE2-NEXT: movl (%rdi), %eax
+; SSE2-NEXT: movl 4(%rdi), %ecx
+; SSE2-NEXT: andl $3, %eax
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movslq 4(%rdi), %rcx
-; SSE2-NEXT: movslq 8(%rdi), %rdx
-; SSE2-NEXT: movslq 12(%rdi), %rsi
+; SSE2-NEXT: andl $3, %ecx
+; SSE2-NEXT: movl 8(%rdi), %edx
+; SSE2-NEXT: andl $3, %edx
+; SSE2-NEXT: movl 12(%rdi), %esi
+; SSE2-NEXT: andl $3, %esi
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -706,11 +788,15 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
;
; SSSE3-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movslq (%rdi), %rax
+; SSSE3-NEXT: movl (%rdi), %eax
+; SSSE3-NEXT: movl 4(%rdi), %ecx
+; SSSE3-NEXT: andl $3, %eax
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movslq 4(%rdi), %rcx
-; SSSE3-NEXT: movslq 8(%rdi), %rdx
-; SSSE3-NEXT: movslq 12(%rdi), %rsi
+; SSSE3-NEXT: andl $3, %ecx
+; SSSE3-NEXT: movl 8(%rdi), %edx
+; SSSE3-NEXT: andl $3, %edx
+; SSSE3-NEXT: movl 12(%rdi), %esi
+; SSSE3-NEXT: andl $3, %esi
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -722,11 +808,15 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
;
; SSE41-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
; SSE41: # BB#0:
-; SSE41-NEXT: movslq (%rdi), %rax
+; SSE41-NEXT: movl (%rdi), %eax
+; SSE41-NEXT: movl 4(%rdi), %ecx
+; SSE41-NEXT: andl $3, %eax
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movslq 4(%rdi), %rcx
-; SSE41-NEXT: movslq 8(%rdi), %rdx
-; SSE41-NEXT: movslq 12(%rdi), %rsi
+; SSE41-NEXT: andl $3, %ecx
+; SSE41-NEXT: movl 8(%rdi), %edx
+; SSE41-NEXT: andl $3, %edx
+; SSE41-NEXT: movl 12(%rdi), %esi
+; SSE41-NEXT: andl $3, %esi
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrd $1, -24(%rsp,%rcx,4), %xmm0
; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0
@@ -735,11 +825,15 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
;
; AVX-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
; AVX: # BB#0:
-; AVX-NEXT: movslq (%rdi), %rax
+; AVX-NEXT: movl (%rdi), %eax
+; AVX-NEXT: movl 4(%rdi), %ecx
+; AVX-NEXT: andl $3, %eax
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movslq 4(%rdi), %rcx
-; AVX-NEXT: movslq 8(%rdi), %rdx
-; AVX-NEXT: movslq 12(%rdi), %rsi
+; AVX-NEXT: andl $3, %ecx
+; AVX-NEXT: movl 8(%rdi), %edx
+; AVX-NEXT: andl $3, %edx
+; AVX-NEXT: movl 12(%rdi), %esi
+; AVX-NEXT: andl $3, %esi
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
@@ -767,55 +861,71 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* %i) nounwind {
; SSE2-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
; SSE2: # BB#0:
-; SSE2-NEXT: movsbq (%rdi), %rcx
+; SSE2-NEXT: movzbl (%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: movsbq 8(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm8
-; SSE2-NEXT: movsbq 12(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm9
-; SSE2-NEXT: movsbq 4(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm3
-; SSE2-NEXT: movsbq 14(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm10
-; SSE2-NEXT: movsbq 6(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm5
-; SSE2-NEXT: movsbq 10(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm11
-; SSE2-NEXT: movsbq 2(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm7
-; SSE2-NEXT: movsbq 15(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm12
-; SSE2-NEXT: movsbq 7(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: movsbq 11(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm13
-; SSE2-NEXT: movsbq 3(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm6
-; SSE2-NEXT: movsbq 13(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm14
-; SSE2-NEXT: movsbq 5(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm4
-; SSE2-NEXT: movsbq 9(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm15
-; SSE2-NEXT: movsbq 1(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %eax
+; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzbl 8(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm8
+; SSE2-NEXT: movzbl 12(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: movzbl 4(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: movzbl 14(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm10
+; SSE2-NEXT: movzbl 6(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: movzbl 10(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm11
+; SSE2-NEXT: movzbl 2(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm7
+; SSE2-NEXT: movzbl 15(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm12
+; SSE2-NEXT: movzbl 7(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movzbl 11(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm13
+; SSE2-NEXT: movzbl 3(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm6
+; SSE2-NEXT: movzbl 13(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm14
+; SSE2-NEXT: movzbl 5(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movzbl 9(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm15
+; SSE2-NEXT: movzbl 1(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -836,55 +946,71 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
;
; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movsbq (%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movsbq 8(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm8
-; SSSE3-NEXT: movsbq 12(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm9
-; SSSE3-NEXT: movsbq 4(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: movsbq 14(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm10
-; SSSE3-NEXT: movsbq 6(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm5
-; SSSE3-NEXT: movsbq 10(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm11
-; SSSE3-NEXT: movsbq 2(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm7
-; SSSE3-NEXT: movsbq 15(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm12
-; SSSE3-NEXT: movsbq 7(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: movsbq 11(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm13
-; SSSE3-NEXT: movsbq 3(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm6
-; SSSE3-NEXT: movsbq 13(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm14
-; SSSE3-NEXT: movsbq 5(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm4
-; SSSE3-NEXT: movsbq 9(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm15
-; SSSE3-NEXT: movsbq 1(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %eax
+; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: movzbl 8(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm8
+; SSSE3-NEXT: movzbl 12(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm9
+; SSSE3-NEXT: movzbl 4(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: movzbl 14(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm10
+; SSSE3-NEXT: movzbl 6(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm5
+; SSSE3-NEXT: movzbl 10(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm11
+; SSSE3-NEXT: movzbl 2(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm7
+; SSSE3-NEXT: movzbl 15(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm12
+; SSSE3-NEXT: movzbl 7(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: movzbl 11(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm13
+; SSSE3-NEXT: movzbl 3(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm6
+; SSSE3-NEXT: movzbl 13(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm14
+; SSSE3-NEXT: movzbl 5(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm4
+; SSSE3-NEXT: movzbl 9(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm15
+; SSSE3-NEXT: movzbl 1(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -911,55 +1037,75 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; SSE41-NEXT: pushq %r13
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movsbq (%rdi), %rax
+; SSE41-NEXT: movzbl (%rdi), %r11d
+; SSE41-NEXT: andl $15, %r11d
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movsbq 1(%rdi), %r15
-; SSE41-NEXT: movsbq 2(%rdi), %r8
-; SSE41-NEXT: movsbq 3(%rdi), %r9
-; SSE41-NEXT: movsbq 4(%rdi), %r10
-; SSE41-NEXT: movsbq 5(%rdi), %r11
-; SSE41-NEXT: movsbq 6(%rdi), %r14
-; SSE41-NEXT: movsbq 7(%rdi), %r12
-; SSE41-NEXT: movsbq 8(%rdi), %r13
-; SSE41-NEXT: movsbq 9(%rdi), %rdx
-; SSE41-NEXT: movsbq 10(%rdi), %rcx
-; SSE41-NEXT: movsbq 11(%rdi), %rsi
-; SSE41-NEXT: movsbq 12(%rdi), %rbx
+; SSE41-NEXT: movzbl 1(%rdi), %r9d
+; SSE41-NEXT: andl $15, %r9d
+; SSE41-NEXT: movzbl 2(%rdi), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; SSE41-NEXT: movzbl 3(%rdi), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; SSE41-NEXT: movzbl 4(%rdi), %r14d
+; SSE41-NEXT: andl $15, %r14d
+; SSE41-NEXT: movzbl 5(%rdi), %r15d
+; SSE41-NEXT: andl $15, %r15d
+; SSE41-NEXT: movzbl 6(%rdi), %r12d
+; SSE41-NEXT: andl $15, %r12d
+; SSE41-NEXT: movzbl 7(%rdi), %r13d
+; SSE41-NEXT: andl $15, %r13d
+; SSE41-NEXT: movzbl 8(%rdi), %r8d
+; SSE41-NEXT: andl $15, %r8d
+; SSE41-NEXT: movzbl 9(%rdi), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: movzbl 10(%rdi), %ecx
+; SSE41-NEXT: andl $15, %ecx
+; SSE41-NEXT: movzbl 11(%rdi), %edx
+; SSE41-NEXT: andl $15, %edx
+; SSE41-NEXT: movzbl 12(%rdi), %esi
+; SSE41-NEXT: andl $15, %esi
; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp
-; SSE41-NEXT: movzbl (%rax,%rbp), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: movsbq 13(%rdi), %rax
-; SSE41-NEXT: pinsrb $1, (%r15,%rbp), %xmm0
-; SSE41-NEXT: movsbq 14(%rdi), %r15
-; SSE41-NEXT: movsbq 15(%rdi), %rdi
-; SSE41-NEXT: movzbl (%rdi,%rbp), %edi
-; SSE41-NEXT: movzbl (%r15,%rbp), %r15d
-; SSE41-NEXT: movzbl (%rax,%rbp), %eax
-; SSE41-NEXT: movzbl (%rbx,%rbp), %ebx
+; SSE41-NEXT: movzbl (%r11,%rbp), %ebx
+; SSE41-NEXT: movd %ebx, %xmm0
+; SSE41-NEXT: movzbl 13(%rdi), %r11d
+; SSE41-NEXT: andl $15, %r11d
+; SSE41-NEXT: pinsrb $1, (%r9,%rbp), %xmm0
+; SSE41-NEXT: movzbl 14(%rdi), %ebx
+; SSE41-NEXT: andl $15, %ebx
+; SSE41-NEXT: movzbl 15(%rdi), %edi
+; SSE41-NEXT: andl $15, %edi
+; SSE41-NEXT: movzbl (%rdi,%rbp), %r10d
+; SSE41-NEXT: movzbl (%rbx,%rbp), %r9d
+; SSE41-NEXT: movzbl (%r11,%rbp), %r11d
; SSE41-NEXT: movzbl (%rsi,%rbp), %esi
-; SSE41-NEXT: movzbl (%rcx,%rbp), %ecx
; SSE41-NEXT: movzbl (%rdx,%rbp), %edx
+; SSE41-NEXT: movzbl (%rcx,%rbp), %ecx
+; SSE41-NEXT: movzbl (%rax,%rbp), %eax
+; SSE41-NEXT: movzbl (%r8,%rbp), %r8d
; SSE41-NEXT: movzbl (%r13,%rbp), %r13d
; SSE41-NEXT: movzbl (%r12,%rbp), %r12d
+; SSE41-NEXT: movzbl (%r15,%rbp), %r15d
; SSE41-NEXT: movzbl (%r14,%rbp), %r14d
-; SSE41-NEXT: movzbl (%r11,%rbp), %r11d
-; SSE41-NEXT: movzbl (%r10,%rbp), %r10d
-; SSE41-NEXT: movzbl (%r9,%rbp), %r9d
-; SSE41-NEXT: movzbl (%r8,%rbp), %ebp
+; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; SSE41-NEXT: movzbl (%rdi,%rbp), %edi
+; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
+; SSE41-NEXT: movzbl (%rbx,%rbp), %ebp
; SSE41-NEXT: pinsrb $2, %ebp, %xmm0
-; SSE41-NEXT: pinsrb $3, %r9d, %xmm0
-; SSE41-NEXT: pinsrb $4, %r10d, %xmm0
-; SSE41-NEXT: pinsrb $5, %r11d, %xmm0
-; SSE41-NEXT: pinsrb $6, %r14d, %xmm0
-; SSE41-NEXT: pinsrb $7, %r12d, %xmm0
-; SSE41-NEXT: pinsrb $8, %r13d, %xmm0
-; SSE41-NEXT: pinsrb $9, %edx, %xmm0
+; SSE41-NEXT: pinsrb $3, %edi, %xmm0
+; SSE41-NEXT: pinsrb $4, %r14d, %xmm0
+; SSE41-NEXT: pinsrb $5, %r15d, %xmm0
+; SSE41-NEXT: pinsrb $6, %r12d, %xmm0
+; SSE41-NEXT: pinsrb $7, %r13d, %xmm0
+; SSE41-NEXT: pinsrb $8, %r8d, %xmm0
+; SSE41-NEXT: pinsrb $9, %eax, %xmm0
; SSE41-NEXT: pinsrb $10, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $11, %esi, %xmm0
-; SSE41-NEXT: pinsrb $12, %ebx, %xmm0
-; SSE41-NEXT: pinsrb $13, %eax, %xmm0
-; SSE41-NEXT: pinsrb $14, %r15d, %xmm0
-; SSE41-NEXT: pinsrb $15, %edi, %xmm0
+; SSE41-NEXT: pinsrb $11, %edx, %xmm0
+; SSE41-NEXT: pinsrb $12, %esi, %xmm0
+; SSE41-NEXT: pinsrb $13, %r11d, %xmm0
+; SSE41-NEXT: pinsrb $14, %r9d, %xmm0
+; SSE41-NEXT: pinsrb $15, %r10d, %xmm0
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
; SSE41-NEXT: popq %r13
@@ -976,55 +1122,75 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; AVX-NEXT: pushq %r13
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movsbq (%rdi), %rsi
+; AVX-NEXT: movzbl (%rdi), %r11d
+; AVX-NEXT: andl $15, %r11d
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movsbq 1(%rdi), %r15
-; AVX-NEXT: movsbq 2(%rdi), %r8
-; AVX-NEXT: movsbq 3(%rdi), %r9
-; AVX-NEXT: movsbq 4(%rdi), %r10
-; AVX-NEXT: movsbq 5(%rdi), %r11
-; AVX-NEXT: movsbq 6(%rdi), %r14
-; AVX-NEXT: movsbq 7(%rdi), %r12
-; AVX-NEXT: movsbq 8(%rdi), %r13
-; AVX-NEXT: movsbq 9(%rdi), %rdx
-; AVX-NEXT: movsbq 10(%rdi), %rax
-; AVX-NEXT: movsbq 11(%rdi), %rcx
-; AVX-NEXT: movsbq 12(%rdi), %rbx
+; AVX-NEXT: movzbl 1(%rdi), %r9d
+; AVX-NEXT: andl $15, %r9d
+; AVX-NEXT: movzbl 2(%rdi), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX-NEXT: movzbl 3(%rdi), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX-NEXT: movzbl 4(%rdi), %r14d
+; AVX-NEXT: andl $15, %r14d
+; AVX-NEXT: movzbl 5(%rdi), %r15d
+; AVX-NEXT: andl $15, %r15d
+; AVX-NEXT: movzbl 6(%rdi), %r12d
+; AVX-NEXT: andl $15, %r12d
+; AVX-NEXT: movzbl 7(%rdi), %r13d
+; AVX-NEXT: andl $15, %r13d
+; AVX-NEXT: movzbl 8(%rdi), %r8d
+; AVX-NEXT: andl $15, %r8d
+; AVX-NEXT: movzbl 9(%rdi), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: movzbl 10(%rdi), %ecx
+; AVX-NEXT: andl $15, %ecx
+; AVX-NEXT: movzbl 11(%rdi), %edx
+; AVX-NEXT: andl $15, %edx
+; AVX-NEXT: movzbl 12(%rdi), %esi
+; AVX-NEXT: andl $15, %esi
; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp
+; AVX-NEXT: movzbl (%r11,%rbp), %ebx
+; AVX-NEXT: vmovd %ebx, %xmm0
+; AVX-NEXT: movzbl 13(%rdi), %r11d
+; AVX-NEXT: andl $15, %r11d
+; AVX-NEXT: vpinsrb $1, (%r9,%rbp), %xmm0, %xmm0
+; AVX-NEXT: movzbl 14(%rdi), %ebx
+; AVX-NEXT: andl $15, %ebx
+; AVX-NEXT: movzbl 15(%rdi), %edi
+; AVX-NEXT: andl $15, %edi
+; AVX-NEXT: movzbl (%rdi,%rbp), %r10d
+; AVX-NEXT: movzbl (%rbx,%rbp), %r9d
+; AVX-NEXT: movzbl (%r11,%rbp), %r11d
; AVX-NEXT: movzbl (%rsi,%rbp), %esi
-; AVX-NEXT: vmovd %esi, %xmm0
-; AVX-NEXT: movsbq 13(%rdi), %rsi
-; AVX-NEXT: vpinsrb $1, (%r15,%rbp), %xmm0, %xmm0
-; AVX-NEXT: movsbq 14(%rdi), %r15
-; AVX-NEXT: movsbq 15(%rdi), %rdi
-; AVX-NEXT: movzbl (%rdi,%rbp), %edi
-; AVX-NEXT: movzbl (%r15,%rbp), %r15d
-; AVX-NEXT: movzbl (%rsi,%rbp), %esi
-; AVX-NEXT: movzbl (%rbx,%rbp), %ebx
+; AVX-NEXT: movzbl (%rdx,%rbp), %edx
; AVX-NEXT: movzbl (%rcx,%rbp), %ecx
; AVX-NEXT: movzbl (%rax,%rbp), %eax
-; AVX-NEXT: movzbl (%rdx,%rbp), %edx
+; AVX-NEXT: movzbl (%r8,%rbp), %r8d
; AVX-NEXT: movzbl (%r13,%rbp), %r13d
; AVX-NEXT: movzbl (%r12,%rbp), %r12d
+; AVX-NEXT: movzbl (%r15,%rbp), %r15d
; AVX-NEXT: movzbl (%r14,%rbp), %r14d
-; AVX-NEXT: movzbl (%r11,%rbp), %r11d
-; AVX-NEXT: movzbl (%r10,%rbp), %r10d
-; AVX-NEXT: movzbl (%r9,%rbp), %r9d
-; AVX-NEXT: movzbl (%r8,%rbp), %ebp
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; AVX-NEXT: movzbl (%rdi,%rbp), %edi
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
+; AVX-NEXT: movzbl (%rbx,%rbp), %ebp
; AVX-NEXT: vpinsrb $2, %ebp, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $3, %r9d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $5, %r11d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $7, %r12d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $8, %r13d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $12, %ebx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $14, %r15d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, %r8d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $11, %edx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %r10d, %xmm0, %xmm0
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r12
; AVX-NEXT: popq %r13
@@ -1106,11 +1272,14 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> %y, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
; SSE: # BB#0:
-; SSE-NEXT: movslq %edi, %rax
+; SSE-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE-NEXT: andl $3, %edi
; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movslq %edx, %rdx
+; SSE-NEXT: andl $3, %edx
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movslq %ecx, %rcx
+; SSE-NEXT: andl $3, %ecx
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -1120,11 +1289,14 @@ define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float>
;
; AVX-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
; AVX: # BB#0:
-; AVX-NEXT: movslq %edi, %rax
+; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: andl $3, %edi
; AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movslq %edx, %rdx
+; AVX-NEXT: andl $3, %edx
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: andl $3, %ecx
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -1151,31 +1323,31 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSE2-NEXT: movswq %di, %r10
-; SSE2-NEXT: movswq %si, %rsi
-; SSE2-NEXT: movswq %dx, %r11
-; SSE2-NEXT: movswq %cx, %rcx
+; SSE2-NEXT: andl $7, %edi
+; SSE2-NEXT: andl $7, %esi
+; SSE2-NEXT: andl $7, %edx
+; SSE2-NEXT: andl $7, %ecx
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movswq %r8w, %rdi
+; SSE2-NEXT: andl $7, %r8d
; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movswq %r9w, %rax
-; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %esi
-; SSE2-NEXT: xorl %edx, %edx
-; SSE2-NEXT: movd %edx, %xmm0
+; SSE2-NEXT: andl $7, %r9d
+; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax
+; SSE2-NEXT: xorl %esi, %esi
+; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movd %esi, %xmm2
-; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: movzwl -40(%rsp,%r10,2), %eax
-; SSE2-NEXT: movzwl -40(%rsp,%r11,2), %ecx
+; SSE2-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; SSE2-NEXT: movzwl -40(%rsp,%rdx,2), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; SSE2-NEXT: movzwl -40(%rsp,%r8,2), %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1190,31 +1362,31 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSSE3-NEXT: movswq %di, %r10
-; SSSE3-NEXT: movswq %si, %rsi
-; SSSE3-NEXT: movswq %dx, %r11
-; SSSE3-NEXT: movswq %cx, %rcx
+; SSSE3-NEXT: andl $7, %edi
+; SSSE3-NEXT: andl $7, %esi
+; SSSE3-NEXT: andl $7, %edx
+; SSSE3-NEXT: andl $7, %ecx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movswq %r8w, %rdi
+; SSSE3-NEXT: andl $7, %r8d
; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movswq %r9w, %rax
-; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi
-; SSSE3-NEXT: xorl %edx, %edx
-; SSSE3-NEXT: movd %edx, %xmm0
+; SSSE3-NEXT: andl $7, %r9d
+; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
+; SSSE3-NEXT: xorl %esi, %esi
+; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movd %esi, %xmm2
-; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: movzwl -40(%rsp,%r10,2), %eax
-; SSSE3-NEXT: movzwl -40(%rsp,%r11,2), %ecx
+; SSSE3-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; SSSE3-NEXT: movzwl -40(%rsp,%rdx,2), %ecx
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; SSSE3-NEXT: movzwl -40(%rsp,%r8,2), %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1229,21 +1401,21 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSE41-NEXT: movswq %di, %rax
-; SSE41-NEXT: movswq %si, %rsi
-; SSE41-NEXT: movswq %dx, %rdx
-; SSE41-NEXT: movswq %cx, %r10
+; SSE41-NEXT: andl $7, %edi
+; SSE41-NEXT: andl $7, %esi
+; SSE41-NEXT: andl $7, %edx
+; SSE41-NEXT: andl $7, %ecx
; SSE41-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movswq %r8w, %rdi
+; SSE41-NEXT: andl $7, %r8d
; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movswq %r9w, %rcx
-; SSE41-NEXT: movzwl -40(%rsp,%rax,2), %eax
+; SSE41-NEXT: andl $7, %r9d
+; SSE41-NEXT: movzwl -40(%rsp,%rdi,2), %eax
; SSE41-NEXT: movd %eax, %xmm1
; SSE41-NEXT: pinsrw $1, -24(%rsp,%rsi,2), %xmm1
; SSE41-NEXT: pinsrw $2, -40(%rsp,%rdx,2), %xmm1
-; SSE41-NEXT: pinsrw $3, -24(%rsp,%r10,2), %xmm1
-; SSE41-NEXT: pinsrw $4, -40(%rsp,%rdi,2), %xmm1
-; SSE41-NEXT: pinsrw $5, -24(%rsp,%rcx,2), %xmm1
+; SSE41-NEXT: pinsrw $3, -24(%rsp,%rcx,2), %xmm1
+; SSE41-NEXT: pinsrw $4, -40(%rsp,%r8,2), %xmm1
+; SSE41-NEXT: pinsrw $5, -24(%rsp,%r9,2), %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE41-NEXT: retq
@@ -1256,21 +1428,21 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; AVX1-NEXT: movswq %di, %r10
-; AVX1-NEXT: movswq %si, %r11
-; AVX1-NEXT: movswq %dx, %rdx
-; AVX1-NEXT: movswq %cx, %rcx
+; AVX1-NEXT: andl $7, %edi
+; AVX1-NEXT: andl $7, %esi
+; AVX1-NEXT: andl $7, %edx
+; AVX1-NEXT: andl $7, %ecx
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movswq %r8w, %rdi
+; AVX1-NEXT: andl $7, %r8d
; AVX1-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movswq %r9w, %rax
-; AVX1-NEXT: movzwl -40(%rsp,%r10,2), %esi
-; AVX1-NEXT: vmovd %esi, %xmm0
-; AVX1-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX1-NEXT: andl $7, %r9d
+; AVX1-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
; AVX1-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-NEXT: retq
@@ -1283,21 +1455,21 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; AVX2-NEXT: movswq %di, %r10
-; AVX2-NEXT: movswq %si, %r11
-; AVX2-NEXT: movswq %dx, %rdx
-; AVX2-NEXT: movswq %cx, %rcx
+; AVX2-NEXT: andl $7, %edi
+; AVX2-NEXT: andl $7, %esi
+; AVX2-NEXT: andl $7, %edx
+; AVX2-NEXT: andl $7, %ecx
; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movswq %r8w, %rdi
+; AVX2-NEXT: andl $7, %r8d
; AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movswq %r9w, %rax
-; AVX2-NEXT: movzwl -40(%rsp,%r10,2), %esi
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX2-NEXT: andl $7, %r9d
+; AVX2-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
; AVX2-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shuffle-variable-256.ll b/test/CodeGen/X86/vector-shuffle-variable-256.ll
index b43ec058ed91..42b3c11d3d6b 100644
--- a/test/CodeGen/X86/vector-shuffle-variable-256.ll
+++ b/test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -13,6 +13,10 @@ define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0,
; ALL-NEXT: movq %rsp, %rbp
; ALL-NEXT: andq $-32, %rsp
; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: andl $3, %ecx
+; ALL-NEXT: andl $3, %edx
+; ALL-NEXT: andl $3, %esi
+; ALL-NEXT: andl $3, %edi
; ALL-NEXT: vmovaps %ymm0, (%rsp)
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
@@ -40,6 +44,8 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0,
; ALL-NEXT: movq %rsp, %rbp
; ALL-NEXT: andq $-32, %rsp
; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: andl $3, %edx
+; ALL-NEXT: andl $3, %esi
; ALL-NEXT: vmovaps %ymm0, (%rsp)
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
@@ -62,6 +68,10 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0,
define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
; ALL: # BB#0:
+; ALL-NEXT: andl $1, %ecx
+; ALL-NEXT: andl $1, %edx
+; ALL-NEXT: andl $1, %esi
+; ALL-NEXT: andl $1, %edi
; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
@@ -87,6 +97,10 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: andl $3, %ecx
+; AVX1-NEXT: andl $3, %edx
+; AVX1-NEXT: andl $3, %esi
+; AVX1-NEXT: andl $3, %edi
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -105,6 +119,10 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: andl $3, %ecx
+; AVX2-NEXT: andl $3, %edx
+; AVX2-NEXT: andl $3, %esi
+; AVX2-NEXT: andl $3, %edi
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -134,6 +152,8 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: andl $3, %esi
+; AVX1-NEXT: andl $3, %edi
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -150,6 +170,8 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: andl $3, %esi
+; AVX2-NEXT: andl $3, %edi
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -173,6 +195,10 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i
define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
; AVX1: # BB#0:
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: andl $1, %esi
+; AVX1-NEXT: andl $1, %edi
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -185,6 +211,10 @@ define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i
;
; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
; AVX2: # BB#0:
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: andl $1, %edi
; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -212,15 +242,23 @@ define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
-; AVX1-NEXT: movslq %edi, %rax
-; AVX1-NEXT: movslq %esi, %rsi
-; AVX1-NEXT: movslq %edx, %rdx
-; AVX1-NEXT: movslq %ecx, %r11
-; AVX1-NEXT: movslq %r8d, %r10
+; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX1-NEXT: andl $7, %edi
+; AVX1-NEXT: andl $7, %esi
+; AVX1-NEXT: andl $7, %edx
+; AVX1-NEXT: andl $7, %ecx
+; AVX1-NEXT: andl $7, %r8d
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
-; AVX1-NEXT: movslq %r9d, %r8
-; AVX1-NEXT: movslq 16(%rbp), %rdi
-; AVX1-NEXT: movslq 24(%rbp), %rcx
+; AVX1-NEXT: andl $7, %r9d
+; AVX1-NEXT: movl 16(%rbp), %r10d
+; AVX1-NEXT: andl $7, %r10d
+; AVX1-NEXT: movl 24(%rbp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -284,15 +322,23 @@ define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0
define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32:
; ALL: # BB#0:
-; ALL-NEXT: movslq %edi, %rax
-; ALL-NEXT: movslq %esi, %rsi
-; ALL-NEXT: movslq %edx, %rdx
-; ALL-NEXT: movslq %ecx, %r11
-; ALL-NEXT: movslq %r8d, %r10
+; ALL-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; ALL-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; ALL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; ALL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; ALL-NEXT: andl $3, %edi
+; ALL-NEXT: andl $3, %esi
+; ALL-NEXT: andl $3, %edx
+; ALL-NEXT: andl $3, %ecx
+; ALL-NEXT: andl $3, %r8d
; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: movslq %r9d, %r8
-; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rdi
-; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rcx
+; ALL-NEXT: andl $3, %r9d
+; ALL-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; ALL-NEXT: andl $3, %r10d
+; ALL-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; ALL-NEXT: andl $3, %eax
; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -331,48 +377,64 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x,
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
-; AVX1-NEXT: movslq 32(%rbp), %rax
+; AVX1-NEXT: movl 32(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: movslq 40(%rbp), %rax
+; AVX1-NEXT: movl 40(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq 48(%rbp), %rax
+; AVX1-NEXT: movl 48(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq 56(%rbp), %rax
+; AVX1-NEXT: movl 56(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq 64(%rbp), %rax
+; AVX1-NEXT: movl 64(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq 72(%rbp), %rax
+; AVX1-NEXT: movl 72(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq 80(%rbp), %rax
+; AVX1-NEXT: movl 80(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq 88(%rbp), %rax
+; AVX1-NEXT: movl 88(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq %edi, %rax
-; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: andl $15, %edi
+; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax
; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: movslq %esi, %rax
-; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %edx, %rax
-; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %ecx, %rax
-; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %r8d, %rax
-; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %r9d, %rax
-; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq 16(%rbp), %rax
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $15, %r8d
+; AVX1-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $15, %r9d
+; AVX1-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
+; AVX1-NEXT: movl 16(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movslq 24(%rbp), %rax
+; AVX1-NEXT: movl 24(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -386,48 +448,64 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x,
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
-; AVX2-NEXT: movslq 32(%rbp), %rax
+; AVX2-NEXT: movl 32(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: movslq 40(%rbp), %rax
+; AVX2-NEXT: movl 40(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq 48(%rbp), %rax
+; AVX2-NEXT: movl 48(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq 56(%rbp), %rax
+; AVX2-NEXT: movl 56(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq 64(%rbp), %rax
+; AVX2-NEXT: movl 64(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq 72(%rbp), %rax
+; AVX2-NEXT: movl 72(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq 80(%rbp), %rax
+; AVX2-NEXT: movl 80(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq 88(%rbp), %rax
+; AVX2-NEXT: movl 88(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq %edi, %rax
-; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: andl $15, %edi
+; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax
; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: movslq %esi, %rax
-; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %edx, %rax
-; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %ecx, %rax
-; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %r8d, %rax
-; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %r9d, %rax
-; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq 16(%rbp), %rax
+; AVX2-NEXT: andl $15, %esi
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $15, %r8d
+; AVX2-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $15, %r9d
+; AVX2-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
+; AVX2-NEXT: movl 16(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movslq 24(%rbp), %rax
+; AVX2-NEXT: movl 24(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
@@ -472,48 +550,64 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x,
define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX1: # BB#0:
+; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq %edi, %rax
-; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: andl $7, %edi
+; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax
; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: movslq %esi, %rax
-; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %edx, %rax
-; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %ecx, %rax
-; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %r8d, %rax
-; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %r9d, %rax
-; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: andl $7, %esi
+; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $7, %edx
+; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $7, %ecx
+; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $7, %r8d
+; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $7, %r9d
+; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -521,48 +615,64 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i
;
; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX2: # BB#0:
+; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq %edi, %rax
-; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: andl $7, %edi
+; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax
; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: movslq %esi, %rax
-; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %edx, %rax
-; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %ecx, %rax
-; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %r8d, %rax
-; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %r9d, %rax
-; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: andl $7, %esi
+; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $7, %edx
+; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $7, %ecx
+; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $7, %r8d
+; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $7, %r9d
+; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
@@ -615,8 +725,12 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwi
; AVX1-NEXT: subq $64, %rsp
; AVX1-NEXT: movq (%rdi), %rax
; AVX1-NEXT: movq 8(%rdi), %rcx
+; AVX1-NEXT: andl $3, %eax
+; AVX1-NEXT: andl $3, %ecx
; AVX1-NEXT: movq 16(%rdi), %rdx
+; AVX1-NEXT: andl $3, %edx
; AVX1-NEXT: movq 24(%rdi), %rsi
+; AVX1-NEXT: andl $3, %esi
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -637,8 +751,12 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwi
; AVX2-NEXT: subq $64, %rsp
; AVX2-NEXT: movq (%rdi), %rax
; AVX2-NEXT: movq 8(%rdi), %rcx
+; AVX2-NEXT: andl $3, %eax
+; AVX2-NEXT: andl $3, %ecx
; AVX2-NEXT: movq 16(%rdi), %rdx
+; AVX2-NEXT: andl $3, %edx
; AVX2-NEXT: movq 24(%rdi), %rsi
+; AVX2-NEXT: andl $3, %esi
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -674,8 +792,12 @@ define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwi
; AVX1: # BB#0:
; AVX1-NEXT: movq (%rdi), %rax
; AVX1-NEXT: movq 8(%rdi), %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: movq 16(%rdi), %rdx
+; AVX1-NEXT: andl $1, %edx
; AVX1-NEXT: movq 24(%rdi), %rsi
+; AVX1-NEXT: andl $1, %esi
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -690,8 +812,12 @@ define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwi
; AVX2: # BB#0:
; AVX2-NEXT: movq (%rdi), %rax
; AVX2-NEXT: movq 8(%rdi), %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: movq 16(%rdi), %rdx
+; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: movq 24(%rdi), %rsi
+; AVX2-NEXT: andl $1, %esi
; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
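The hunks above update the expected code for the variable-index shuffle tests: where the old lowering sign-extended each index with movslq, the new lowering loads it with movl and masks it into range with andl ($15 for 16 elements, $7 for 8, $3 for 4, $1 for 2) before indexing the stack spill of the source vector. The exact test bodies are not part of this diff, so the function below is only an illustrative sketch of the kind of IR these checks exercise; the name and element count are chosen for brevity and are not taken from the test file.
; Variable-index extract/insert of the sort lowered via a stack spill plus
; masked indexing. Compiling a function like this with llc for an AVX x86-64
; target would be expected to show the vmovaps spill and the andl $3 index
; masking pattern checked in the hunks above.
define <4 x i64> @var_shuffle_sketch(<4 x i64> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
  %e0 = extractelement <4 x i64> %x, i32 %i0
  %e1 = extractelement <4 x i64> %x, i32 %i1
  %e2 = extractelement <4 x i64> %x, i32 %i2
  %e3 = extractelement <4 x i64> %x, i32 %i3
  %r0 = insertelement <4 x i64> undef, i64 %e0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %e1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %e2, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 %e3, i32 3
  ret <4 x i64> %r3
}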
diff --git a/test/CodeGen/X86/x86-64-double-shifts-var.ll b/test/CodeGen/X86/x86-64-double-shifts-var.ll
index 8d2dbbdb5d24..c025ee874b2d 100644
--- a/test/CodeGen/X86/x86-64-double-shifts-var.ll
+++ b/test/CodeGen/X86/x86-64-double-shifts-var.ll
@@ -17,6 +17,7 @@
; RUN: llc < %s -march=x86-64 -mcpu=bdver2 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=bdver3 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=bdver4 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=znver1 | FileCheck %s
; Verify that for the X86_64 processors that are known to have poor latency
; double precision shift instructions we do not generate 'shld' or 'shrd'