about | summary | refs | log | tree | commit | diff
path: root/test
diff options
context:
space:
mode:
Diffstat (limited to 'test')
-rw-r--r-- test/CodeGen/X86/avx512-calling-conv.ll | 29
-rw-r--r-- test/CodeGen/X86/avx512-cvt.ll | 48
-rwxr-xr-x test/CodeGen/X86/avx512-schedule.ll | 16
-rw-r--r-- test/CodeGen/X86/pr34080-2.ll | 136
-rw-r--r-- test/CodeGen/X86/pr34080.ll | 78
-rw-r--r-- test/CodeGen/X86/shrink_vmul.ll | 2995
-rw-r--r-- test/CodeGen/X86/v8i1-masks.ll | 102
-rw-r--r-- test/MC/ELF/comdat-name-number.s | 28
8 files changed, 2395 insertions, 1037 deletions
diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll
index 248462d0de51..821c65bef06a 100644
--- a/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/test/CodeGen/X86/avx512-calling-conv.ll
@@ -228,14 +228,9 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL-NEXT: callq _func8xi1
-; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
-; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
-; KNL-NEXT: movb $85, %al
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
+; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpsllw $15, %xmm0, %xmm0
+; KNL-NEXT: vpsraw $15, %xmm0, %xmm0
; KNL-NEXT: popq %rax
; KNL-NEXT: retq
;
@@ -247,12 +242,9 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; SKX-NEXT: vpmovm2w %k0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _func8xi1
+; SKX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
-; SKX-NEXT: vpmovw2m %xmm0, %k0
-; SKX-NEXT: movb $85, %al
-; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: kandb %k1, %k0, %k0
-; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: vpsraw $15, %xmm0, %xmm0
; SKX-NEXT: popq %rax
; SKX-NEXT: retq
;
@@ -264,14 +256,9 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL_X32-NEXT: calll _func8xi1
-; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0
-; KNL_X32-NEXT: vpsllq $63, %zmm0, %zmm0
-; KNL_X32-NEXT: movb $85, %al
-; KNL_X32-NEXT: kmovw %eax, %k1
-; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
-; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
-; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
+; KNL_X32-NEXT: vpand LCPI7_0, %xmm0, %xmm0
+; KNL_X32-NEXT: vpsllw $15, %xmm0, %xmm0
+; KNL_X32-NEXT: vpsraw $15, %xmm0, %xmm0
; KNL_X32-NEXT: addl $12, %esp
; KNL_X32-NEXT: retl
%cmpRes = icmp sgt <8 x i32>%a, %b
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index e88ec9d7b159..18e9f306bc1b 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -1691,8 +1691,8 @@ define <2 x double> @sbto2f64(<2 x double> %a) {
; VLDQ: # %bb.0:
; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0
-; VLDQ-NEXT: vpmovm2q %k0, %xmm0
-; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0
+; VLDQ-NEXT: vpmovm2d %k0, %xmm0
+; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sbto2f64:
@@ -1700,12 +1700,8 @@ define <2 x double> @sbto2f64(<2 x double> %a) {
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; VLNODQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm1
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm0
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0
; VLNODQ-NEXT: retq
%cmpres = fcmp ogt <2 x double> %a, zeroinitializer
%1 = sitofp <2 x i1> %cmpres to <2 x double>
@@ -2002,30 +1998,22 @@ define <2 x double> @ubto2f64(<2 x i32> %a) {
; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; NOVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; NOVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
-; VLDQ-LABEL: ubto2f64:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; VLDQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
-; VLDQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
-; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: ubto2f64:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; VLNODQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
-; VLNODQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm1
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm0
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; VLNODQ-NEXT: retq
+; VL-LABEL: ubto2f64:
+; VL: # %bb.0:
+; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; VL-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
+; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; VL-NEXT: vcvtudq2pd %xmm0, %xmm0
+; VL-NEXT: retq
%mask = icmp ult <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x double>
ret <2 x double> %1
diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll
index 78111874b58a..306b95f0f3ae 100755
--- a/test/CodeGen/X86/avx512-schedule.ll
+++ b/test/CodeGen/X86/avx512-schedule.ll
@@ -2602,16 +2602,16 @@ define <2 x double> @sbto2f64(<2 x double> %a) {
; GENERIC: # %bb.0:
; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
; GENERIC-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00]
-; GENERIC-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: sbto2f64:
; SKX: # %bb.0:
; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00]
-; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25]
-; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%cmpres = fcmp ogt <2 x double> %a, zeroinitializer
%1 = sitofp <2 x i1> %cmpres to <2 x double>
@@ -2989,8 +2989,8 @@ define <2 x double> @ubto2f64(<2 x i32> %a) {
; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50]
; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [4:0.50]
-; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vcvtudq2pd %xmm0, %xmm0 # sched: [4:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: ubto2f64:
@@ -2998,8 +2998,8 @@ define <2 x double> @ubto2f64(<2 x i32> %a) {
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33]
; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50]
-; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50]
+; SKX-NEXT: vcvtudq2pd %xmm0, %xmm0 # sched: [5:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ult <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x double>
diff --git a/test/CodeGen/X86/pr34080-2.ll b/test/CodeGen/X86/pr34080-2.ll
new file mode 100644
index 000000000000..5c00f0e3706b
--- /dev/null
+++ b/test/CodeGen/X86/pr34080-2.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-openbsd6.2 | FileCheck %s
+
+%struct.DateTime = type { i64, i32, i32, i32, i32, i32, double, i8 }
+
+define void @computeJD(%struct.DateTime*) nounwind { ; reads i32 fields 1-5 and double field 6 of the struct, writes 1 to field 7 and an i64 sum to field 0. NOTE(review): name and constants suggest a Julian-day style date computation - not stated in this file, confirm
+; CHECK-LABEL: computeJD:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: andl $-8, %esp
+; CHECK-NEXT: subl $32, %esp
+; CHECK-NEXT: movl 8(%ebp), %ebx
+; CHECK-NEXT: movl 8(%ebx), %esi
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpl $3, 12(%ebx)
+; CHECK-NEXT: setl %al
+; CHECK-NEXT: subl %eax, %esi
+; CHECK-NEXT: movl $-1374389535, %ecx # imm = 0xAE147AE1
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: imull %ecx
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: sarl $5, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: movl $1374389535, %edx # imm = 0x51EB851F
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: imull %edx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: sarl $7, %edi
+; CHECK-NEXT: addl %eax, %edi
+; CHECK-NEXT: imull $36525, %esi, %eax # imm = 0x8EAD
+; CHECK-NEXT: addl $172251900, %eax # imm = 0xA445AFC
+; CHECK-NEXT: movl $1374389535, %edx # imm = 0x51EB851F
+; CHECK-NEXT: imull %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: sarl $5, %edx
+; CHECK-NEXT: addl %eax, %edx
+; CHECK-NEXT: addl 16(%ebx), %ecx
+; CHECK-NEXT: addl %edi, %ecx
+; CHECK-NEXT: leal 257(%ecx,%edx), %eax
+; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT: fildl {{[0-9]+}}(%esp)
+; CHECK-NEXT: fadds {{\.LCPI.*}}
+; CHECK-NEXT: fmuls {{\.LCPI.*}}
+; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp)
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movw $3199, {{[0-9]+}}(%esp) # imm = 0xC7F
+; CHECK-NEXT: fldcw {{[0-9]+}}(%esp)
+; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp)
+; CHECK-NEXT: fistpll {{[0-9]+}}(%esp)
+; CHECK-NEXT: fldcw {{[0-9]+}}(%esp)
+; CHECK-NEXT: movb $1, 36(%ebx)
+; CHECK-NEXT: imull $3600000, 20(%ebx), %eax # imm = 0x36EE80
+; CHECK-NEXT: imull $60000, 24(%ebx), %ecx # imm = 0xEA60
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: fldl 28(%ebx)
+; CHECK-NEXT: fmuls {{\.LCPI.*}}
+; CHECK-NEXT: fnstcw (%esp)
+; CHECK-NEXT: movzwl (%esp), %eax
+; CHECK-NEXT: movw $3199, (%esp) # imm = 0xC7F
+; CHECK-NEXT: fldcw (%esp)
+; CHECK-NEXT: movw %ax, (%esp)
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: sarl $31, %eax
+; CHECK-NEXT: fistpll {{[0-9]+}}(%esp)
+; CHECK-NEXT: fldcw (%esp)
+; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %ecx, (%ebx)
+; CHECK-NEXT: movl %eax, 4(%ebx)
+; CHECK-NEXT: leal -12(%ebp), %esp
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: retl
+ %2 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 7 ; address of field 7 (the i8 member)
+ %3 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 1 ; address of field 1 (i32)
+ %4 = load i32, i32* %3, align 4
+ %5 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 2 ; address of field 2 (i32)
+ %6 = load i32, i32* %5, align 4
+ %7 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 3 ; address of field 3 (i32)
+ %8 = load i32, i32* %7, align 4
+ %9 = icmp slt i32 %6, 3 ; %9 = (field 2 < 3), signed compare
+ %10 = add i32 %6, 12
+ %11 = select i1 %9, i32 %10, i32 %6 ; field 2, plus 12 when %9 holds; only feeds %19 below
+ %12 = sext i1 %9 to i32 ; -1 when %9 holds, otherwise 0
+ %13 = add i32 %4, %12 ; field 1, decremented by one when %9 holds
+ %14 = sdiv i32 %13, -100 ; signed division by a negative constant
+ %15 = sdiv i32 %13, 400
+ %16 = mul i32 %13, 36525
+ %17 = add i32 %16, 172251900
+ %18 = sdiv i32 %17, 100
+ %19 = mul i32 %11, 306001 ; %19, %20 and %21 are computed but %21 is never referenced again in this function
+ %20 = add i32 %19, 306001
+ %21 = sdiv i32 %20, 10000
+ %22 = add i32 %8, 2 ; start of the i32 sum: field 3 + 2 + %14 + %15 + 255 + %18
+ %23 = add i32 %22, %14
+ %24 = add i32 %23, %15
+ %25 = add i32 %24, 255
+ %26 = add i32 %25, %18
+ %27 = sitofp i32 %26 to double ; the i32 sum converted to double
+ %28 = fadd double %27, -1.524500e+03
+ %29 = fmul double %28, 8.640000e+07
+ %30 = fptosi double %29 to i64 ; first double-to-i64 conversion
+ %31 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 0 ; address of field 0 (i64), destination of the final store
+ store i8 1, i8* %2, align 4 ; write 1 into field 7
+ %32 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 4 ; address of field 4 (i32)
+ %33 = load i32, i32* %32, align 4
+ %34 = mul i32 %33, 3600000
+ %35 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 5 ; address of field 5 (i32)
+ %36 = load i32, i32* %35, align 4
+ %37 = mul i32 %36, 60000
+ %38 = add i32 %37, %34
+ %39 = sext i32 %38 to i64 ; widen the i32 sum of the two products to i64
+ %40 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 6 ; address of field 6 (double)
+ %41 = load double, double* %40, align 4 ; the double is loaded with 4-byte alignment
+ %42 = fmul double %41, 1.000000e+03
+ %43 = fptosi double %42 to i64 ; second double-to-i64 conversion
+ %44 = add i64 %39, %43
+ %45 = add i64 %44, %30
+ store i64 %45, i64* %31, align 4 ; field 0 = %39 + %43 + %30
+ ret void
+}
+
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i486" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/pr34080.ll b/test/CodeGen/X86/pr34080.ll
index 72dbf3c48516..e0b09745ad9e 100644
--- a/test/CodeGen/X86/pr34080.ll
+++ b/test/CodeGen/X86/pr34080.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 -mcpu=x86-64 | FileCheck %s --check-prefix=SSE2-BROKEN
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 -mcpu=x86-64 | FileCheck %s --check-prefix=SSE2-SCHEDULE
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse3 -mcpu=prescott | FileCheck %s --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX
@@ -46,44 +46,44 @@ define void @_Z1fe(x86_fp80 %z) local_unnamed_addr #0 {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
-; SSE2-BROKEN-LABEL: _Z1fe:
-; SSE2-BROKEN: ## %bb.0: ## %entry
-; SSE2-BROKEN-NEXT: pushq %rbp
-; SSE2-BROKEN-NEXT: .cfi_def_cfa_offset 16
-; SSE2-BROKEN-NEXT: .cfi_offset %rbp, -16
-; SSE2-BROKEN-NEXT: movq %rsp, %rbp
-; SSE2-BROKEN-NEXT: .cfi_def_cfa_register %rbp
-; SSE2-BROKEN-NEXT: fnstcw -4(%rbp)
-; SSE2-BROKEN-NEXT: fldt 16(%rbp)
-; SSE2-BROKEN-NEXT: movzwl -4(%rbp), %eax
-; SSE2-BROKEN-NEXT: movw $3199, -4(%rbp) ## imm = 0xC7F
-; SSE2-BROKEN-NEXT: fldcw -4(%rbp)
-; SSE2-BROKEN-NEXT: movw %ax, -4(%rbp)
-; SSE2-BROKEN-NEXT: fistl -8(%rbp)
-; SSE2-BROKEN-NEXT: fldcw -4(%rbp)
-; SSE2-BROKEN-NEXT: cvtsi2sdl -8(%rbp), %xmm0
-; SSE2-BROKEN-NEXT: movsd %xmm0, -64(%rbp)
-; SSE2-BROKEN-NEXT: movsd %xmm0, -32(%rbp)
-; SSE2-BROKEN-NEXT: fsubl -32(%rbp)
-; SSE2-BROKEN-NEXT: flds {{.*}}(%rip)
-; SSE2-BROKEN-NEXT: fnstcw -2(%rbp)
-; SSE2-BROKEN-NEXT: fmul %st(0), %st(1)
-; SSE2-BROKEN-NEXT: movzwl -2(%rbp), %eax
-; SSE2-BROKEN-NEXT: movw $3199, -2(%rbp) ## imm = 0xC7F
-; SSE2-BROKEN-NEXT: fldcw -2(%rbp)
-; SSE2-BROKEN-NEXT: movw %ax, -2(%rbp)
-; SSE2-BROKEN-NEXT: fxch %st(1)
-; SSE2-BROKEN-NEXT: fistl -12(%rbp)
-; SSE2-BROKEN-NEXT: fldcw -2(%rbp)
-; SSE2-BROKEN-NEXT: xorps %xmm0, %xmm0
-; SSE2-BROKEN-NEXT: cvtsi2sdl -12(%rbp), %xmm0
-; SSE2-BROKEN-NEXT: movsd %xmm0, -56(%rbp)
-; SSE2-BROKEN-NEXT: movsd %xmm0, -24(%rbp)
-; SSE2-BROKEN-NEXT: fsubl -24(%rbp)
-; SSE2-BROKEN-NEXT: fmulp %st(1)
-; SSE2-BROKEN-NEXT: fstpl -48(%rbp)
-; SSE2-BROKEN-NEXT: popq %rbp
-; SSE2-BROKEN-NEXT: retq
+; SSE2-SCHEDULE-LABEL: _Z1fe:
+; SSE2-SCHEDULE: ## %bb.0: ## %entry
+; SSE2-SCHEDULE-NEXT: pushq %rbp
+; SSE2-SCHEDULE-NEXT: .cfi_def_cfa_offset 16
+; SSE2-SCHEDULE-NEXT: .cfi_offset %rbp, -16
+; SSE2-SCHEDULE-NEXT: movq %rsp, %rbp
+; SSE2-SCHEDULE-NEXT: .cfi_def_cfa_register %rbp
+; SSE2-SCHEDULE-NEXT: fnstcw -4(%rbp)
+; SSE2-SCHEDULE-NEXT: fldt 16(%rbp)
+; SSE2-SCHEDULE-NEXT: movzwl -4(%rbp), %eax
+; SSE2-SCHEDULE-NEXT: movw $3199, -4(%rbp) ## imm = 0xC7F
+; SSE2-SCHEDULE-NEXT: fldcw -4(%rbp)
+; SSE2-SCHEDULE-NEXT: movw %ax, -4(%rbp)
+; SSE2-SCHEDULE-NEXT: fistl -8(%rbp)
+; SSE2-SCHEDULE-NEXT: fldcw -4(%rbp)
+; SSE2-SCHEDULE-NEXT: cvtsi2sdl -8(%rbp), %xmm0
+; SSE2-SCHEDULE-NEXT: movsd %xmm0, -64(%rbp)
+; SSE2-SCHEDULE-NEXT: movsd %xmm0, -32(%rbp)
+; SSE2-SCHEDULE-NEXT: fsubl -32(%rbp)
+; SSE2-SCHEDULE-NEXT: flds {{.*}}(%rip)
+; SSE2-SCHEDULE-NEXT: fnstcw -2(%rbp)
+; SSE2-SCHEDULE-NEXT: fmul %st(0), %st(1)
+; SSE2-SCHEDULE-NEXT: movzwl -2(%rbp), %eax
+; SSE2-SCHEDULE-NEXT: movw $3199, -2(%rbp) ## imm = 0xC7F
+; SSE2-SCHEDULE-NEXT: fldcw -2(%rbp)
+; SSE2-SCHEDULE-NEXT: movw %ax, -2(%rbp)
+; SSE2-SCHEDULE-NEXT: fxch %st(1)
+; SSE2-SCHEDULE-NEXT: fistl -12(%rbp)
+; SSE2-SCHEDULE-NEXT: fldcw -2(%rbp)
+; SSE2-SCHEDULE-NEXT: xorps %xmm0, %xmm0
+; SSE2-SCHEDULE-NEXT: cvtsi2sdl -12(%rbp), %xmm0
+; SSE2-SCHEDULE-NEXT: movsd %xmm0, -56(%rbp)
+; SSE2-SCHEDULE-NEXT: movsd %xmm0, -24(%rbp)
+; SSE2-SCHEDULE-NEXT: fsubl -24(%rbp)
+; SSE2-SCHEDULE-NEXT: fmulp %st(1)
+; SSE2-SCHEDULE-NEXT: fstpl -48(%rbp)
+; SSE2-SCHEDULE-NEXT: popq %rbp
+; SSE2-SCHEDULE-NEXT: retq
;
; SSE3-LABEL: _Z1fe:
; SSE3: ## %bb.0: ## %entry
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index 5700b1df15bd..a516c709517d 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -1,6 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
@c = external global i32*, align 8
@@ -11,42 +15,69 @@
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_2xi8:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movzwl (%edx,%ecx), %edx
-; X86-NEXT: movd %edx, %xmm0
-; X86-NEXT: movzwl (%eax,%ecx), %eax
-; X86-NEXT: movd %eax, %xmm1
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi8:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-NEXT: movd %ecx, %xmm1
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-NEXT: movq %xmm1, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi8:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
+; X86-SSE-NEXT: movd %edx, %xmm0
+; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
+; X86-SSE-NEXT: movd %eax, %xmm1
+; X86-SSE-NEXT: pxor %xmm2, %xmm2
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi8:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: .cfi_offset %esi, -8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: movl c, %esi
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi8:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm1
+; X64-SSE-NEXT: pxor %xmm2, %xmm2
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi8:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -71,38 +102,63 @@ entry:
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_4xi8:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_4xi8:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-NEXT: movdqu %xmm1, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_4xi8:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: pxor %xmm2, %xmm2
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_4xi8:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: .cfi_offset %esi, -8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: movl c, %esi
+; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_4xi8:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: pxor %xmm2, %xmm2
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_4xi8:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -127,44 +183,106 @@ entry:
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_8xi8:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: movdqa %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_8xi8:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_8xi8:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT: pxor %xmm2, %xmm2
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: mul_8xi8:
+; X86-AVX1: # %bb.0: # %entry
+; X86-AVX1-NEXT: pushl %esi
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %esi, -8
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT: movl c, %esi
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4)
+; X86-AVX1-NEXT: popl %esi
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: mul_8xi8:
+; X86-AVX2: # %bb.0: # %entry
+; X86-AVX2-NEXT: pushl %esi
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %esi, -8
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT: movl c, %esi
+; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
+; X86-AVX2-NEXT: popl %esi
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: mul_8xi8:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE-NEXT: pxor %xmm2, %xmm2
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: mul_8xi8:
+; X64-AVX1: # %bb.0: # %entry
+; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4)
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_8xi8:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -189,64 +307,150 @@ entry:
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_16xi8:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movdqu (%edx,%ecx), %xmm0
-; X86-NEXT: movdqu (%eax,%ecx), %xmm1
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X86-NEXT: movdqa %xmm1, %xmm4
-; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X86-NEXT: pmullw %xmm3, %xmm4
-; X86-NEXT: movdqa %xmm4, %xmm3
-; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X86-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; X86-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: movdqa %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm0, 32(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm4, 16(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm3, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_16xi8:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movdqu (%rdi,%rdx), %xmm0
-; X64-NEXT: movdqu (%rsi,%rdx), %xmm1
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: movdqa %xmm0, %xmm3
-; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-NEXT: movdqa %xmm1, %xmm4
-; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X64-NEXT: pmullw %xmm3, %xmm4
-; X64-NEXT: movdqa %xmm4, %xmm3
-; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; X64-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm3, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_16xi8:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
+; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
+; X86-SSE-NEXT: pxor %xmm2, %xmm2
+; X86-SSE-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X86-SSE-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X86-SSE-NEXT: pmullw %xmm3, %xmm4
+; X86-SSE-NEXT: movdqa %xmm4, %xmm3
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: mul_16xi8:
+; X86-AVX1: # %bb.0: # %entry
+; X86-AVX1-NEXT: pushl %esi
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %esi, -8
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT: movl c, %esi
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4)
+; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4)
+; X86-AVX1-NEXT: popl %esi
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: mul_16xi8:
+; X86-AVX2: # %bb.0: # %entry
+; X86-AVX2-NEXT: pushl %esi
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %esi, -8
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT: movl c, %esi
+; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
+; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
+; X86-AVX2-NEXT: popl %esi
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: mul_16xi8:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
+; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
+; X64-SSE-NEXT: pxor %xmm2, %xmm2
+; X64-SSE-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X64-SSE-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X64-SSE-NEXT: pmullw %xmm3, %xmm4
+; X64-SSE-NEXT: movdqa %xmm4, %xmm3
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: mul_16xi8:
+; X64-AVX1: # %bb.0: # %entry
+; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4)
+; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4)
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_16xi8:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
+; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -271,36 +475,65 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_2xi16:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: movdqa %xmm1, %xmm2
-; X86-NEXT: pmulhuw %xmm0, %xmm2
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi16:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: pmulhuw %xmm0, %xmm2
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-NEXT: movq %xmm1, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi16:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi16:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: .cfi_offset %esi, -8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: movl c, %esi
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi16:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi16:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -325,36 +558,61 @@ entry:
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_4xi16:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT: movdqa %xmm1, %xmm2
-; X86-NEXT: pmulhuw %xmm0, %xmm2
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_4xi16:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: pmulhuw %xmm0, %xmm2
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-NEXT: movdqu %xmm1, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_4xi16:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_4xi16:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: .cfi_offset %esi, -8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: movl c, %esi
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_4xi16:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_4xi16:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -379,42 +637,104 @@ entry:
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_8xi16:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movdqu (%edx,%ecx), %xmm0
-; X86-NEXT: movdqu (%eax,%ecx), %xmm1
-; X86-NEXT: movdqa %xmm1, %xmm2
-; X86-NEXT: pmulhuw %xmm0, %xmm2
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: movdqa %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_8xi16:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movdqu (%rdi,%rdx), %xmm0
-; X64-NEXT: movdqu (%rsi,%rdx), %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: pmulhuw %xmm0, %xmm2
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_8xi16:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
+; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
+; X86-SSE-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: mul_8xi16:
+; X86-AVX1: # %bb.0: # %entry
+; X86-AVX1-NEXT: pushl %esi
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %esi, -8
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT: movl c, %esi
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4)
+; X86-AVX1-NEXT: popl %esi
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: mul_8xi16:
+; X86-AVX2: # %bb.0: # %entry
+; X86-AVX2-NEXT: pushl %esi
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %esi, -8
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT: movl c, %esi
+; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
+; X86-AVX2-NEXT: popl %esi
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: mul_8xi16:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
+; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
+; X64-SSE-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: mul_8xi16:
+; X64-AVX1: # %bb.0: # %entry
+; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4)
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_8xi16:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -439,62 +759,148 @@ entry:
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_16xi16:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movdqu (%edx,%ecx), %xmm0
-; X86-NEXT: movdqu 16(%edx,%ecx), %xmm1
-; X86-NEXT: movdqu (%eax,%ecx), %xmm2
-; X86-NEXT: movdqu 16(%eax,%ecx), %xmm3
-; X86-NEXT: movdqa %xmm2, %xmm4
-; X86-NEXT: pmulhuw %xmm0, %xmm4
-; X86-NEXT: pmullw %xmm0, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X86-NEXT: movdqa %xmm3, %xmm4
-; X86-NEXT: pmulhuw %xmm1, %xmm4
-; X86-NEXT: pmullw %xmm1, %xmm3
-; X86-NEXT: movdqa %xmm3, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; X86-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_16xi16:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movdqu (%rdi,%rdx), %xmm0
-; X64-NEXT: movdqu 16(%rdi,%rdx), %xmm1
-; X64-NEXT: movdqu (%rsi,%rdx), %xmm2
-; X64-NEXT: movdqu 16(%rsi,%rdx), %xmm3
-; X64-NEXT: movdqa %xmm2, %xmm4
-; X64-NEXT: pmulhuw %xmm0, %xmm4
-; X64-NEXT: pmullw %xmm0, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X64-NEXT: movdqa %xmm3, %xmm4
-; X64-NEXT: pmulhuw %xmm1, %xmm4
-; X64-NEXT: pmullw %xmm1, %xmm3
-; X64-NEXT: movdqa %xmm3, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; X64-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_16xi16:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
+; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1
+; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2
+; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3
+; X86-SSE-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4
+; X86-SSE-NEXT: pmullw %xmm0, %xmm2
+; X86-SSE-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X86-SSE-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4
+; X86-SSE-NEXT: pmullw %xmm1, %xmm3
+; X86-SSE-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: mul_16xi16:
+; X86-AVX1: # %bb.0: # %entry
+; X86-AVX1-NEXT: pushl %esi
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %esi, -8
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT: movl c, %esi
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4)
+; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4)
+; X86-AVX1-NEXT: popl %esi
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: mul_16xi16:
+; X86-AVX2: # %bb.0: # %entry
+; X86-AVX2-NEXT: pushl %esi
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %esi, -8
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT: movl c, %esi
+; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
+; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
+; X86-AVX2-NEXT: popl %esi
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: mul_16xi16:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
+; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
+; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
+; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
+; X64-SSE-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4
+; X64-SSE-NEXT: pmullw %xmm0, %xmm2
+; X64-SSE-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X64-SSE-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4
+; X64-SSE-NEXT: pmullw %xmm1, %xmm3
+; X64-SSE-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: mul_16xi16:
+; X64-AVX1: # %bb.0: # %entry
+; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4)
+; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4)
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_16xi16:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
+; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -519,46 +925,73 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_2xi8_sext:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movzwl (%edx,%ecx), %edx
-; X86-NEXT: movd %edx, %xmm0
-; X86-NEXT: movzwl (%eax,%ecx), %eax
-; X86-NEXT: movd %eax, %xmm1
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT: psraw $8, %xmm0
-; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT: psraw $8, %xmm1
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-NEXT: psrad $16, %xmm0
-; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi8_sext:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-NEXT: movd %ecx, %xmm1
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: psraw $8, %xmm0
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: psraw $8, %xmm1
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-NEXT: psrad $16, %xmm0
-; X64-NEXT: movq %xmm0, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi8_sext:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
+; X86-SSE-NEXT: movd %edx, %xmm0
+; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
+; X86-SSE-NEXT: movd %eax, %xmm1
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT: psraw $8, %xmm0
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT: psraw $8, %xmm1
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: psrad $16, %xmm0
+; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi8_sext:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: .cfi_offset %esi, -8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: movl c, %esi
+; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi8_sext:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm1
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT: psraw $8, %xmm0
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT: psraw $8, %xmm1
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: psrad $16, %xmm0
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi8_sext:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -583,48 +1016,75 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_2xi8_sext_zext:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movzwl (%edx,%ecx), %edx
-; X86-NEXT: movd %edx, %xmm0
-; X86-NEXT: movzwl (%eax,%ecx), %eax
-; X86-NEXT: movd %eax, %xmm1
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT: psraw $8, %xmm0
-; X86-NEXT: movdqa %xmm1, %xmm2
-; X86-NEXT: pmulhw %xmm0, %xmm2
-; X86-NEXT: pmullw %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi8_sext_zext:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-NEXT: movd %ecx, %xmm1
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: psraw $8, %xmm0
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: pmulhw %xmm0, %xmm2
-; X64-NEXT: pmullw %xmm1, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: movq %xmm0, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi8_sext_zext:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
+; X86-SSE-NEXT: movd %edx, %xmm0
+; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
+; X86-SSE-NEXT: movd %eax, %xmm1
+; X86-SSE-NEXT: pxor %xmm2, %xmm2
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT: psraw $8, %xmm0
+; X86-SSE-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
+; X86-SSE-NEXT: pmullw %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi8_sext_zext:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: .cfi_offset %esi, -8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: movl c, %esi
+; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi8_sext_zext:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm1
+; X64-SSE-NEXT: pxor %xmm2, %xmm2
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT: psraw $8, %xmm0
+; X64-SSE-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
+; X64-SSE-NEXT: pmullw %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi8_sext_zext:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -649,36 +1109,63 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_2xi16_sext:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: movdqa %xmm1, %xmm2
-; X86-NEXT: pmulhw %xmm0, %xmm2
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi16_sext:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: pmulhw %xmm0, %xmm2
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-NEXT: movq %xmm1, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi16_sext:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi16_sext:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: .cfi_offset %esi, -8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: movl c, %esi
+; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
+; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi16_sext:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi16_sext:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
+; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -703,62 +1190,93 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_2xi16_sext_zext:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X86-NEXT: psrad $16, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X86-NEXT: movdqa %xmm1, %xmm2
-; X86-NEXT: psrlq $32, %xmm2
-; X86-NEXT: pmuludq %xmm0, %xmm2
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: psrlq $32, %xmm3
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: paddq %xmm2, %xmm3
-; X86-NEXT: psllq $32, %xmm3
-; X86-NEXT: pmuludq %xmm0, %xmm1
-; X86-NEXT: paddq %xmm3, %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi16_sext_zext:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X64-NEXT: psrad $16, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: psrlq $32, %xmm2
-; X64-NEXT: pmuludq %xmm0, %xmm2
-; X64-NEXT: movdqa %xmm0, %xmm3
-; X64-NEXT: psrlq $32, %xmm3
-; X64-NEXT: pmuludq %xmm1, %xmm3
-; X64-NEXT: paddq %xmm2, %xmm3
-; X64-NEXT: psllq $32, %xmm3
-; X64-NEXT: pmuludq %xmm0, %xmm1
-; X64-NEXT: paddq %xmm3, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X64-NEXT: movq %xmm0, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi16_sext_zext:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X86-SSE-NEXT: psrad $16, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: pxor %xmm2, %xmm2
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; X86-SSE-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE-NEXT: psrlq $32, %xmm2
+; X86-SSE-NEXT: pmuludq %xmm0, %xmm2
+; X86-SSE-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE-NEXT: psrlq $32, %xmm3
+; X86-SSE-NEXT: pmuludq %xmm1, %xmm3
+; X86-SSE-NEXT: paddq %xmm2, %xmm3
+; X86-SSE-NEXT: psllq $32, %xmm3
+; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
+; X86-SSE-NEXT: paddq %xmm3, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi16_sext_zext:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: .cfi_offset %esi, -8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: movl c, %esi
+; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi16_sext_zext:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X64-SSE-NEXT: psrad $16, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: pxor %xmm2, %xmm2
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; X64-SSE-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE-NEXT: psrlq $32, %xmm2
+; X64-SSE-NEXT: pmuludq %xmm0, %xmm2
+; X64-SSE-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE-NEXT: psrlq $32, %xmm3
+; X64-SSE-NEXT: pmuludq %xmm1, %xmm3
+; X64-SSE-NEXT: paddq %xmm2, %xmm3
+; X64-SSE-NEXT: psllq $32, %xmm3
+; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT: paddq %xmm3, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi16_sext_zext:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -783,62 +1301,148 @@ entry:
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_16xi16_sext:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movdqu (%edx,%ecx), %xmm0
-; X86-NEXT: movdqu 16(%edx,%ecx), %xmm1
-; X86-NEXT: movdqu (%eax,%ecx), %xmm2
-; X86-NEXT: movdqu 16(%eax,%ecx), %xmm3
-; X86-NEXT: movdqa %xmm2, %xmm4
-; X86-NEXT: pmulhw %xmm0, %xmm4
-; X86-NEXT: pmullw %xmm0, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X86-NEXT: movdqa %xmm3, %xmm4
-; X86-NEXT: pmulhw %xmm1, %xmm4
-; X86-NEXT: pmullw %xmm1, %xmm3
-; X86-NEXT: movdqa %xmm3, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; X86-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_16xi16_sext:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movdqu (%rdi,%rdx), %xmm0
-; X64-NEXT: movdqu 16(%rdi,%rdx), %xmm1
-; X64-NEXT: movdqu (%rsi,%rdx), %xmm2
-; X64-NEXT: movdqu 16(%rsi,%rdx), %xmm3
-; X64-NEXT: movdqa %xmm2, %xmm4
-; X64-NEXT: pmulhw %xmm0, %xmm4
-; X64-NEXT: pmullw %xmm0, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X64-NEXT: movdqa %xmm3, %xmm4
-; X64-NEXT: pmulhw %xmm1, %xmm4
-; X64-NEXT: pmullw %xmm1, %xmm3
-; X64-NEXT: movdqa %xmm3, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; X64-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_16xi16_sext:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
+; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1
+; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2
+; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3
+; X86-SSE-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE-NEXT: pmulhw %xmm0, %xmm4
+; X86-SSE-NEXT: pmullw %xmm0, %xmm2
+; X86-SSE-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X86-SSE-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE-NEXT: pmulhw %xmm1, %xmm4
+; X86-SSE-NEXT: pmullw %xmm1, %xmm3
+; X86-SSE-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: mul_16xi16_sext:
+; X86-AVX1: # %bb.0: # %entry
+; X86-AVX1-NEXT: pushl %esi
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %esi, -8
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT: movl c, %esi
+; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm0
+; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm1
+; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm2
+; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm3
+; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4
+; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
+; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4
+; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
+; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4
+; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4
+; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4)
+; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4)
+; X86-AVX1-NEXT: popl %esi
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: mul_16xi16_sext:
+; X86-AVX2: # %bb.0: # %entry
+; X86-AVX2-NEXT: pushl %esi
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %esi, -8
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT: movl c, %esi
+; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0
+; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1
+; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2
+; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2
+; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
+; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
+; X86-AVX2-NEXT: popl %esi
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: mul_16xi16_sext:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
+; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
+; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
+; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
+; X64-SSE-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE-NEXT: pmulhw %xmm0, %xmm4
+; X64-SSE-NEXT: pmullw %xmm0, %xmm2
+; X64-SSE-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X64-SSE-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE-NEXT: pmulhw %xmm1, %xmm4
+; X64-SSE-NEXT: pmullw %xmm1, %xmm3
+; X64-SSE-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: mul_16xi16_sext:
+; X64-AVX1: # %bb.0: # %entry
+; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm0
+; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm1
+; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm2
+; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm3
+; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4
+; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
+; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4
+; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
+; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4
+; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4
+; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4)
+; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4)
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_16xi16_sext:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0
+; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1
+; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2
+; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2
+; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
+; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -862,31 +1466,54 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
-; X86-LABEL: mul_2xi8_varconst1:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl c, %edx
-; X86-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-NEXT: movd %ecx, %xmm0
-; X86-NEXT: pxor %xmm1, %xmm1
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi8_varconst1:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: pxor %xmm1, %xmm1
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi8_varconst1:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl c, %edx
+; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT: movd %ecx, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi8_varconst1:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl c, %edx
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi8_varconst1:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: pxor %xmm1, %xmm1
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi8_varconst1:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: movl $255, %ecx
+; X64-AVX-NEXT: vmovq %rcx, %xmm1
+; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -906,33 +1533,53 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
-; X86-LABEL: mul_2xi8_varconst2:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl c, %edx
-; X86-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-NEXT: movd %ecx, %xmm0
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT: psraw $8, %xmm0
-; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X86-NEXT: psrad $16, %xmm0
-; X86-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi8_varconst2:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: psraw $8, %xmm0
-; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X64-NEXT: psrad $16, %xmm0
-; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi8_varconst2:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl c, %edx
+; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT: movd %ecx, %xmm0
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT: psraw $8, %xmm0
+; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE-NEXT: psrad $16, %xmm0
+; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi8_varconst2:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl c, %edx
+; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi8_varconst2:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT: psraw $8, %xmm0
+; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE-NEXT: psrad $16, %xmm0
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi8_varconst2:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -952,37 +1599,60 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
-; X86-LABEL: mul_2xi8_varconst3:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl c, %edx
-; X86-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-NEXT: movd %ecx, %xmm0
-; X86-NEXT: pxor %xmm1, %xmm1
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmulhw %xmm1, %xmm2
-; X86-NEXT: pmullw %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi8_varconst3:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: pxor %xmm1, %xmm1
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmulhw %xmm1, %xmm2
-; X64-NEXT: pmullw %xmm1, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi8_varconst3:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl c, %edx
+; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT: movd %ecx, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
+; X86-SSE-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X86-SSE-NEXT: pmullw %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi8_varconst3:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl c, %edx
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi8_varconst3:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: pxor %xmm1, %xmm1
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
+; X64-SSE-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X64-SSE-NEXT: pmullw %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi8_varconst3:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: movl $256, %ecx # imm = 0x100
+; X64-AVX-NEXT: vmovq %rcx, %xmm1
+; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -1002,37 +1672,57 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
-; X86-LABEL: mul_2xi8_varconst4:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl c, %edx
-; X86-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-NEXT: movd %ecx, %xmm0
-; X86-NEXT: pxor %xmm1, %xmm1
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmulhw %xmm1, %xmm2
-; X86-NEXT: pmullw %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi8_varconst4:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: pxor %xmm1, %xmm1
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmulhw %xmm1, %xmm2
-; X64-NEXT: pmullw %xmm1, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi8_varconst4:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl c, %edx
+; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT: movd %ecx, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
+; X86-SSE-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X86-SSE-NEXT: pmullw %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi8_varconst4:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl c, %edx
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi8_varconst4:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: pxor %xmm1, %xmm1
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
+; X64-SSE-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X64-SSE-NEXT: pmullw %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi8_varconst4:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -1052,37 +1742,57 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
-; X86-LABEL: mul_2xi8_varconst5:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl c, %edx
-; X86-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-NEXT: movd %ecx, %xmm0
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT: psraw $8, %xmm0
-; X86-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmulhw %xmm1, %xmm2
-; X86-NEXT: pmullw %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi8_varconst5:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: psraw $8, %xmm0
-; X64-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmulhw %xmm1, %xmm2
-; X64-NEXT: pmullw %xmm1, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi8_varconst5:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl c, %edx
+; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT: movd %ecx, %xmm0
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT: psraw $8, %xmm0
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
+; X86-SSE-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X86-SSE-NEXT: pmullw %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi8_varconst5:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl c, %edx
+; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi8_varconst5:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT: psraw $8, %xmm0
+; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
+; X64-SSE-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X64-SSE-NEXT: pmullw %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi8_varconst5:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -1102,37 +1812,57 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
-; X86-LABEL: mul_2xi8_varconst6:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl c, %edx
-; X86-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-NEXT: movd %ecx, %xmm0
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT: psraw $8, %xmm0
-; X86-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmulhw %xmm1, %xmm2
-; X86-NEXT: pmullw %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi8_varconst6:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: psraw $8, %xmm0
-; X64-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmulhw %xmm1, %xmm2
-; X64-NEXT: pmullw %xmm1, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi8_varconst6:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl c, %edx
+; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT: movd %ecx, %xmm0
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT: psraw $8, %xmm0
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
+; X86-SSE-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X86-SSE-NEXT: pmullw %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi8_varconst6:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl c, %edx
+; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi8_varconst6:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT: psraw $8, %xmm0
+; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
+; X64-SSE-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X64-SSE-NEXT: pmullw %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi8_varconst6:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -1152,31 +1882,58 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
-; X86-LABEL: mul_2xi16_varconst1:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl c, %edx
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmulhuw %xmm1, %xmm2
-; X86-NEXT: pmullw %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi16_varconst1:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmulhuw %xmm1, %xmm2
-; X64-NEXT: pmullw %xmm1, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi16_varconst1:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl c, %edx
+; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
+; X86-SSE-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2
+; X86-SSE-NEXT: pmullw %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi16_varconst1:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl c, %edx
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi16_varconst1:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
+; X64-SSE-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2
+; X64-SSE-NEXT: pmullw %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi16_varconst1:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X64-AVX-NEXT: movl $65535, %ecx # imm = 0xFFFF
+; X64-AVX-NEXT: vmovq %rcx, %xmm1
+; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -1196,31 +1953,51 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
-; X86-LABEL: mul_2xi16_varconst2:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl c, %edx
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmulhw %xmm1, %xmm2
-; X86-NEXT: pmullw %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi16_varconst2:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmulhw %xmm1, %xmm2
-; X64-NEXT: pmullw %xmm1, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi16_varconst2:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl c, %edx
+; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
+; X86-SSE-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X86-SSE-NEXT: pmullw %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi16_varconst2:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl c, %edx
+; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi16_varconst2:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
+; X64-SSE-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X64-SSE-NEXT: pmullw %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi16_varconst2:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -1240,45 +2017,72 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
-; X86-LABEL: mul_2xi16_varconst3:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl c, %edx
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pxor %xmm1, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: psrlq $32, %xmm0
-; X86-NEXT: pmuludq %xmm1, %xmm0
-; X86-NEXT: psllq $32, %xmm0
-; X86-NEXT: paddq %xmm2, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi16_varconst3:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: pxor %xmm1, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-NEXT: movl $65536, %ecx # imm = 0x10000
-; X64-NEXT: movq %rcx, %xmm1
-; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmuludq %xmm1, %xmm2
-; X64-NEXT: psrlq $32, %xmm0
-; X64-NEXT: pmuludq %xmm1, %xmm0
-; X64-NEXT: psllq $32, %xmm0
-; X64-NEXT: paddq %xmm2, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi16_varconst3:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl c, %edx
+; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0]
+; X86-SSE-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
+; X86-SSE-NEXT: psrlq $32, %xmm0
+; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
+; X86-SSE-NEXT: psllq $32, %xmm0
+; X86-SSE-NEXT: paddq %xmm2, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi16_varconst3:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl c, %edx
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi16_varconst3:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: pxor %xmm1, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000
+; X64-SSE-NEXT: movq %rcx, %xmm1
+; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-SSE-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
+; X64-SSE-NEXT: psrlq $32, %xmm0
+; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
+; X64-SSE-NEXT: psllq $32, %xmm0
+; X64-SSE-NEXT: paddq %xmm2, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi16_varconst3:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X64-AVX-NEXT: movl $65536, %ecx # imm = 0x10000
+; X64-AVX-NEXT: vmovq %rcx, %xmm1
+; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -1298,45 +2102,68 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
-; X86-LABEL: mul_2xi16_varconst4:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl c, %edx
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X86-NEXT: psrad $16, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: psrlq $32, %xmm0
-; X86-NEXT: pmuludq %xmm1, %xmm0
-; X86-NEXT: psllq $32, %xmm0
-; X86-NEXT: paddq %xmm2, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi16_varconst4:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X64-NEXT: psrad $16, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-NEXT: movl $32768, %ecx # imm = 0x8000
-; X64-NEXT: movq %rcx, %xmm1
-; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmuludq %xmm1, %xmm2
-; X64-NEXT: psrlq $32, %xmm0
-; X64-NEXT: pmuludq %xmm1, %xmm0
-; X64-NEXT: psllq $32, %xmm0
-; X64-NEXT: paddq %xmm2, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi16_varconst4:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl c, %edx
+; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X86-SSE-NEXT: psrad $16, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0]
+; X86-SSE-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
+; X86-SSE-NEXT: psrlq $32, %xmm0
+; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
+; X86-SSE-NEXT: psllq $32, %xmm0
+; X86-SSE-NEXT: paddq %xmm2, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi16_varconst4:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl c, %edx
+; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi16_varconst4:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X64-SSE-NEXT: psrad $16, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000
+; X64-SSE-NEXT: movq %rcx, %xmm1
+; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-SSE-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
+; X64-SSE-NEXT: psrlq $32, %xmm0
+; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
+; X64-SSE-NEXT: psllq $32, %xmm0
+; X64-SSE-NEXT: paddq %xmm2, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi16_varconst4:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: movl $32768, %ecx # imm = 0x8000
+; X64-AVX-NEXT: vmovq %rcx, %xmm1
+; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -1355,99 +2182,389 @@ entry:
;
define void @PR34947() {
-; X86-LABEL: PR34947:
-; X86: # %bb.0:
-; X86-NEXT: movdqa (%eax), %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; X86-NEXT: movd %xmm1, %ecx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divl %ecx
-; X86-NEXT: movd %edx, %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; X86-NEXT: movd %xmm2, %ecx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divl %ecx
-; X86-NEXT: movd %edx, %xmm2
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT: movd %xmm0, %ecx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divl %ecx
-; X86-NEXT: movd %edx, %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-NEXT: movd %xmm0, %ecx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divl %ecx
-; X86-NEXT: movd %edx, %xmm0
-; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divl (%eax)
-; X86-NEXT: movd %edx, %xmm0
-; X86-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X86-NEXT: pmuludq %xmm2, %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-NEXT: pmuludq %xmm2, %xmm3
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-NEXT: movl $8199, %eax # imm = 0x2007
-; X86-NEXT: movd %eax, %xmm2
-; X86-NEXT: pmuludq %xmm0, %xmm2
-; X86-NEXT: movd %xmm2, (%eax)
-; X86-NEXT: movdqa %xmm1, (%eax)
-; X86-NEXT: retl
-;
-; X64-LABEL: PR34947:
-; X64: # %bb.0:
-; X64-NEXT: movdqa (%rax), %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; X64-NEXT: movd %xmm1, %ecx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divl %ecx
-; X64-NEXT: movd %edx, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; X64-NEXT: movd %xmm2, %ecx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divl %ecx
-; X64-NEXT: movd %edx, %xmm2
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT: movd %xmm0, %ecx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divl %ecx
-; X64-NEXT: movd %edx, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X64-NEXT: movd %xmm0, %ecx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divl %ecx
-; X64-NEXT: movd %edx, %xmm0
-; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divl (%rax)
-; X64-NEXT: movd %edx, %xmm0
-; X64-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X64-NEXT: pmuludq %xmm2, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-NEXT: pmuludq %xmm2, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-NEXT: movl $8199, %eax # imm = 0x2007
-; X64-NEXT: movd %eax, %xmm2
-; X64-NEXT: pmuludq %xmm0, %xmm2
-; X64-NEXT: movd %xmm2, (%rax)
-; X64-NEXT: movdqa %xmm1, (%rax)
-; X64-NEXT: retq
+; X86-SSE-LABEL: PR34947:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movdqa (%eax), %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; X86-SSE-NEXT: movd %xmm1, %ecx
+; X86-SSE-NEXT: xorl %eax, %eax
+; X86-SSE-NEXT: xorl %edx, %edx
+; X86-SSE-NEXT: divl %ecx
+; X86-SSE-NEXT: movd %edx, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: movd %xmm2, %ecx
+; X86-SSE-NEXT: xorl %eax, %eax
+; X86-SSE-NEXT: xorl %edx, %edx
+; X86-SSE-NEXT: divl %ecx
+; X86-SSE-NEXT: movd %edx, %xmm2
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-SSE-NEXT: movd %xmm0, %ecx
+; X86-SSE-NEXT: xorl %eax, %eax
+; X86-SSE-NEXT: xorl %edx, %edx
+; X86-SSE-NEXT: divl %ecx
+; X86-SSE-NEXT: movd %edx, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE-NEXT: movd %xmm0, %ecx
+; X86-SSE-NEXT: xorl %eax, %eax
+; X86-SSE-NEXT: xorl %edx, %edx
+; X86-SSE-NEXT: divl %ecx
+; X86-SSE-NEXT: movd %edx, %xmm0
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE-NEXT: xorl %eax, %eax
+; X86-SSE-NEXT: xorl %edx, %edx
+; X86-SSE-NEXT: divl (%eax)
+; X86-SSE-NEXT: movd %edx, %xmm0
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-SSE-NEXT: pmuludq %xmm2, %xmm3
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: movl $8199, %eax # imm = 0x2007
+; X86-SSE-NEXT: movd %eax, %xmm2
+; X86-SSE-NEXT: pmuludq %xmm0, %xmm2
+; X86-SSE-NEXT: movd %xmm2, (%eax)
+; X86-SSE-NEXT: movdqa %xmm1, (%eax)
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: PR34947:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: pushl %ebp
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: pushl %ebx
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 12
+; X86-AVX1-NEXT: pushl %edi
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 16
+; X86-AVX1-NEXT: pushl %esi
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 20
+; X86-AVX1-NEXT: subl $16, %esp
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 36
+; X86-AVX1-NEXT: .cfi_offset %esi, -20
+; X86-AVX1-NEXT: .cfi_offset %edi, -16
+; X86-AVX1-NEXT: .cfi_offset %ebx, -12
+; X86-AVX1-NEXT: .cfi_offset %ebp, -8
+; X86-AVX1-NEXT: vmovdqa (%eax), %ymm0
+; X86-AVX1-NEXT: xorl %eax, %eax
+; X86-AVX1-NEXT: xorl %edx, %edx
+; X86-AVX1-NEXT: divl (%eax)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
+; X86-AVX1-NEXT: xorl %eax, %eax
+; X86-AVX1-NEXT: xorl %edx, %edx
+; X86-AVX1-NEXT: divl %ecx
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-AVX1-NEXT: vpextrd $2, %xmm0, %ecx
+; X86-AVX1-NEXT: xorl %eax, %eax
+; X86-AVX1-NEXT: xorl %edx, %edx
+; X86-AVX1-NEXT: divl %ecx
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %ecx
+; X86-AVX1-NEXT: xorl %eax, %eax
+; X86-AVX1-NEXT: xorl %edx, %edx
+; X86-AVX1-NEXT: divl %ecx
+; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-AVX1-NEXT: vmovd %xmm0, %ecx
+; X86-AVX1-NEXT: xorl %eax, %eax
+; X86-AVX1-NEXT: xorl %edx, %edx
+; X86-AVX1-NEXT: divl %ecx
+; X86-AVX1-NEXT: movl %edx, %ebp
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-AVX1-NEXT: xorl %eax, %eax
+; X86-AVX1-NEXT: xorl %edx, %edx
+; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
+; X86-AVX1-NEXT: divl %ecx
+; X86-AVX1-NEXT: movl %edx, %ecx
+; X86-AVX1-NEXT: xorl %eax, %eax
+; X86-AVX1-NEXT: xorl %edx, %edx
+; X86-AVX1-NEXT: vpextrd $2, %xmm0, %esi
+; X86-AVX1-NEXT: divl %esi
+; X86-AVX1-NEXT: movl %edx, %esi
+; X86-AVX1-NEXT: xorl %eax, %eax
+; X86-AVX1-NEXT: xorl %edx, %edx
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edi
+; X86-AVX1-NEXT: divl %edi
+; X86-AVX1-NEXT: movl %edx, %edi
+; X86-AVX1-NEXT: xorl %eax, %eax
+; X86-AVX1-NEXT: xorl %edx, %edx
+; X86-AVX1-NEXT: vmovd %xmm0, %ebx
+; X86-AVX1-NEXT: divl %ebx
+; X86-AVX1-NEXT: vmovd %edx, %xmm0
+; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %ebp, %xmm1
+; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm2 # 4-byte Folded Reload
+; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007
+; X86-AVX1-NEXT: vmovd %eax, %xmm3
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]
+; X86-AVX1-NEXT: vpmulld %xmm4, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmulld %xmm4, %xmm1, %xmm1
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm1
+; X86-AVX1-NEXT: vmovd %xmm1, (%eax)
+; X86-AVX1-NEXT: vmovaps %ymm0, (%eax)
+; X86-AVX1-NEXT: addl $16, %esp
+; X86-AVX1-NEXT: popl %esi
+; X86-AVX1-NEXT: popl %edi
+; X86-AVX1-NEXT: popl %ebx
+; X86-AVX1-NEXT: popl %ebp
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: PR34947:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: pushl %esi
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %esi, -8
+; X86-AVX2-NEXT: vmovdqa (%eax), %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpextrd $1, %xmm1, %ecx
+; X86-AVX2-NEXT: xorl %eax, %eax
+; X86-AVX2-NEXT: xorl %edx, %edx
+; X86-AVX2-NEXT: divl %ecx
+; X86-AVX2-NEXT: movl %edx, %ecx
+; X86-AVX2-NEXT: vmovd %xmm1, %esi
+; X86-AVX2-NEXT: xorl %eax, %eax
+; X86-AVX2-NEXT: xorl %edx, %edx
+; X86-AVX2-NEXT: divl %esi
+; X86-AVX2-NEXT: vmovd %edx, %xmm2
+; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; X86-AVX2-NEXT: vpextrd $2, %xmm1, %ecx
+; X86-AVX2-NEXT: xorl %eax, %eax
+; X86-AVX2-NEXT: xorl %edx, %edx
+; X86-AVX2-NEXT: divl %ecx
+; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
+; X86-AVX2-NEXT: vpextrd $3, %xmm1, %ecx
+; X86-AVX2-NEXT: xorl %eax, %eax
+; X86-AVX2-NEXT: xorl %edx, %edx
+; X86-AVX2-NEXT: divl %ecx
+; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm1
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %ecx
+; X86-AVX2-NEXT: xorl %eax, %eax
+; X86-AVX2-NEXT: xorl %edx, %edx
+; X86-AVX2-NEXT: divl %ecx
+; X86-AVX2-NEXT: movl %edx, %ecx
+; X86-AVX2-NEXT: vmovd %xmm0, %esi
+; X86-AVX2-NEXT: xorl %eax, %eax
+; X86-AVX2-NEXT: xorl %edx, %edx
+; X86-AVX2-NEXT: divl %esi
+; X86-AVX2-NEXT: vmovd %edx, %xmm2
+; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; X86-AVX2-NEXT: vpextrd $2, %xmm0, %ecx
+; X86-AVX2-NEXT: xorl %eax, %eax
+; X86-AVX2-NEXT: xorl %edx, %edx
+; X86-AVX2-NEXT: divl %ecx
+; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
+; X86-AVX2-NEXT: vpextrd $3, %xmm0, %ecx
+; X86-AVX2-NEXT: xorl %eax, %eax
+; X86-AVX2-NEXT: xorl %edx, %edx
+; X86-AVX2-NEXT: divl %ecx
+; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
+; X86-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: xorl %eax, %eax
+; X86-AVX2-NEXT: xorl %edx, %edx
+; X86-AVX2-NEXT: divl (%eax)
+; X86-AVX2-NEXT: vmovd %edx, %xmm1
+; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
+; X86-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT: movl $8199, %eax # imm = 0x2007
+; X86-AVX2-NEXT: vmovd %eax, %xmm2
+; X86-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT: vmovd %xmm1, (%eax)
+; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax)
+; X86-AVX2-NEXT: popl %esi
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: PR34947:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movdqa (%rax), %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; X64-SSE-NEXT: movd %xmm1, %ecx
+; X64-SSE-NEXT: xorl %eax, %eax
+; X64-SSE-NEXT: xorl %edx, %edx
+; X64-SSE-NEXT: divl %ecx
+; X64-SSE-NEXT: movd %edx, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: movd %xmm2, %ecx
+; X64-SSE-NEXT: xorl %eax, %eax
+; X64-SSE-NEXT: xorl %edx, %edx
+; X64-SSE-NEXT: divl %ecx
+; X64-SSE-NEXT: movd %edx, %xmm2
+; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-SSE-NEXT: movd %xmm0, %ecx
+; X64-SSE-NEXT: xorl %eax, %eax
+; X64-SSE-NEXT: xorl %edx, %edx
+; X64-SSE-NEXT: divl %ecx
+; X64-SSE-NEXT: movd %edx, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-SSE-NEXT: movd %xmm0, %ecx
+; X64-SSE-NEXT: xorl %eax, %eax
+; X64-SSE-NEXT: xorl %edx, %edx
+; X64-SSE-NEXT: divl %ecx
+; X64-SSE-NEXT: movd %edx, %xmm0
+; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-SSE-NEXT: xorl %eax, %eax
+; X64-SSE-NEXT: xorl %edx, %edx
+; X64-SSE-NEXT: divl (%rax)
+; X64-SSE-NEXT: movd %edx, %xmm0
+; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-SSE-NEXT: pmuludq %xmm2, %xmm3
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-SSE-NEXT: movl $8199, %eax # imm = 0x2007
+; X64-SSE-NEXT: movd %eax, %xmm2
+; X64-SSE-NEXT: pmuludq %xmm0, %xmm2
+; X64-SSE-NEXT: movd %xmm2, (%rax)
+; X64-SSE-NEXT: movdqa %xmm1, (%rax)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: PR34947:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: pushq %rbp
+; X64-AVX1-NEXT: .cfi_def_cfa_offset 16
+; X64-AVX1-NEXT: pushq %rbx
+; X64-AVX1-NEXT: .cfi_def_cfa_offset 24
+; X64-AVX1-NEXT: .cfi_offset %rbx, -24
+; X64-AVX1-NEXT: .cfi_offset %rbp, -16
+; X64-AVX1-NEXT: vmovdqa (%rax), %ymm0
+; X64-AVX1-NEXT: xorl %eax, %eax
+; X64-AVX1-NEXT: xorl %edx, %edx
+; X64-AVX1-NEXT: divl (%rax)
+; X64-AVX1-NEXT: movl %edx, %r8d
+; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
+; X64-AVX1-NEXT: xorl %eax, %eax
+; X64-AVX1-NEXT: xorl %edx, %edx
+; X64-AVX1-NEXT: divl %ecx
+; X64-AVX1-NEXT: movl %edx, %r9d
+; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx
+; X64-AVX1-NEXT: xorl %eax, %eax
+; X64-AVX1-NEXT: xorl %edx, %edx
+; X64-AVX1-NEXT: divl %ecx
+; X64-AVX1-NEXT: movl %edx, %r10d
+; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ecx
+; X64-AVX1-NEXT: xorl %eax, %eax
+; X64-AVX1-NEXT: xorl %edx, %edx
+; X64-AVX1-NEXT: divl %ecx
+; X64-AVX1-NEXT: movl %edx, %r11d
+; X64-AVX1-NEXT: vmovd %xmm0, %ecx
+; X64-AVX1-NEXT: xorl %eax, %eax
+; X64-AVX1-NEXT: xorl %edx, %edx
+; X64-AVX1-NEXT: divl %ecx
+; X64-AVX1-NEXT: movl %edx, %esi
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
+; X64-AVX1-NEXT: xorl %eax, %eax
+; X64-AVX1-NEXT: xorl %edx, %edx
+; X64-AVX1-NEXT: divl %ecx
+; X64-AVX1-NEXT: movl %edx, %edi
+; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx
+; X64-AVX1-NEXT: xorl %eax, %eax
+; X64-AVX1-NEXT: xorl %edx, %edx
+; X64-AVX1-NEXT: divl %ecx
+; X64-AVX1-NEXT: movl %edx, %ecx
+; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ebx
+; X64-AVX1-NEXT: xorl %eax, %eax
+; X64-AVX1-NEXT: xorl %edx, %edx
+; X64-AVX1-NEXT: divl %ebx
+; X64-AVX1-NEXT: movl %edx, %ebx
+; X64-AVX1-NEXT: vmovd %xmm0, %ebp
+; X64-AVX1-NEXT: xorl %eax, %eax
+; X64-AVX1-NEXT: xorl %edx, %edx
+; X64-AVX1-NEXT: divl %ebp
+; X64-AVX1-NEXT: vmovd %edx, %xmm0
+; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
+; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %esi, %xmm2
+; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
+; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2
+; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2
+; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovd %r8d, %xmm1
+; X64-AVX1-NEXT: movl $8199, %eax # imm = 0x2007
+; X64-AVX1-NEXT: vmovd %eax, %xmm2
+; X64-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
+; X64-AVX1-NEXT: vmovd %xmm1, (%rax)
+; X64-AVX1-NEXT: vmovaps %ymm0, (%rax)
+; X64-AVX1-NEXT: popq %rbx
+; X64-AVX1-NEXT: popq %rbp
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: PR34947:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovdqa (%rax), %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpextrd $1, %xmm1, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: xorl %edx, %edx
+; X64-AVX2-NEXT: divl %ecx
+; X64-AVX2-NEXT: movl %edx, %ecx
+; X64-AVX2-NEXT: vmovd %xmm1, %esi
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: xorl %edx, %edx
+; X64-AVX2-NEXT: divl %esi
+; X64-AVX2-NEXT: vmovd %edx, %xmm2
+; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; X64-AVX2-NEXT: vpextrd $2, %xmm1, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: xorl %edx, %edx
+; X64-AVX2-NEXT: divl %ecx
+; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
+; X64-AVX2-NEXT: vpextrd $3, %xmm1, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: xorl %edx, %edx
+; X64-AVX2-NEXT: divl %ecx
+; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm1
+; X64-AVX2-NEXT: vpextrd $1, %xmm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: xorl %edx, %edx
+; X64-AVX2-NEXT: divl %ecx
+; X64-AVX2-NEXT: movl %edx, %ecx
+; X64-AVX2-NEXT: vmovd %xmm0, %esi
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: xorl %edx, %edx
+; X64-AVX2-NEXT: divl %esi
+; X64-AVX2-NEXT: vmovd %edx, %xmm2
+; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; X64-AVX2-NEXT: vpextrd $2, %xmm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: xorl %edx, %edx
+; X64-AVX2-NEXT: divl %ecx
+; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
+; X64-AVX2-NEXT: vpextrd $3, %xmm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: xorl %edx, %edx
+; X64-AVX2-NEXT: divl %ecx
+; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
+; X64-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: xorl %edx, %edx
+; X64-AVX2-NEXT: divl (%rax)
+; X64-AVX2-NEXT: vmovd %edx, %xmm1
+; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
+; X64-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: movl $8199, %eax # imm = 0x2007
+; X64-AVX2-NEXT: vmovd %eax, %xmm2
+; X64-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovd %xmm1, (%rax)
+; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
%tmp = load <9 x i32>, <9 x i32>* undef, align 64
%rem = urem <9 x i32> zeroinitializer, %tmp
%mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll
index 5175850c734f..a799b0e6f12d 100644
--- a/test/CodeGen/X86/v8i1-masks.ll
+++ b/test/CodeGen/X86/v8i1-masks.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X32-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64-AVX2
define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
; X32-LABEL: and_masks:
@@ -31,6 +33,37 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
; X64-NEXT: vmovaps %ymm0, (%rax)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
+;
+; X32-AVX2-LABEL: and_masks:
+; X32-AVX2: ## %bb.0:
+; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-AVX2-NEXT: vmovups (%edx), %ymm0
+; X32-AVX2-NEXT: vmovups (%ecx), %ymm1
+; X32-AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm1
+; X32-AVX2-NEXT: vmovups (%eax), %ymm2
+; X32-AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vmovaps %ymm0, (%eax)
+; X32-AVX2-NEXT: vzeroupper
+; X32-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: and_masks:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vmovups (%rdi), %ymm0
+; X64-AVX2-NEXT: vmovups (%rsi), %ymm1
+; X64-AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovups (%rdx), %ymm2
+; X64-AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovaps %ymm0, (%rax)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
%v0 = load <8 x float>, <8 x float>* %a, align 16
%v1 = load <8 x float>, <8 x float>* %b, align 16
%m0 = fcmp olt <8 x float> %v1, %v0
@@ -62,6 +95,28 @@ define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
; X64-NEXT: vmovaps %ymm0, (%rax)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
+;
+; X32-AVX2-LABEL: neg_masks:
+; X32-AVX2: ## %bb.0:
+; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX2-NEXT: vmovups (%ecx), %ymm0
+; X32-AVX2-NEXT: vcmpnltps (%eax), %ymm0, %ymm0
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; X32-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vmovaps %ymm0, (%eax)
+; X32-AVX2-NEXT: vzeroupper
+; X32-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: neg_masks:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vmovups (%rsi), %ymm0
+; X64-AVX2-NEXT: vcmpnltps (%rdi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; X64-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovaps %ymm0, (%rax)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
%v0 = load <8 x float>, <8 x float>* %a, align 16
%v1 = load <8 x float>, <8 x float>* %b, align 16
%m0 = fcmp olt <8 x float> %v1, %v0
@@ -71,3 +126,50 @@ define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
ret void
}
+define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) {
+; X32-LABEL: and_mask_constant:
+; X32: ## %bb.0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; X32-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; X32-NEXT: vpand LCPI2_0, %xmm0, %xmm0
+; X32-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: and_mask_constant:
+; X64: ## %bb.0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; X64-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+;
+; X32-AVX2-LABEL: and_mask_constant:
+; X32-AVX2: ## %bb.0:
+; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpand LCPI2_0, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: and_mask_constant:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: retq
+ %m = icmp eq <8 x i32> %v0, zeroinitializer
+ %mand = and <8 x i1> %m, <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>
+ %r = zext <8 x i1> %mand to <8 x i32>
+ ret <8 x i32> %r
+}
diff --git a/test/MC/ELF/comdat-name-number.s b/test/MC/ELF/comdat-name-number.s
new file mode 100644
index 000000000000..21e2ed7399f0
--- /dev/null
+++ b/test/MC/ELF/comdat-name-number.s
@@ -0,0 +1,28 @@
+// RUN: llvm-mc -triple x86_64-pc-linux-gnu %s -filetype=obj -o %t.o
+// RUN: llvm-readobj -elf-section-groups %t.o | FileCheck %s
+
+// Test that we can handle numeric COMDAT names.
+
+.section .foo,"G",@progbits,123,comdat
+.section .bar,"G",@progbits,abc,comdat
+
+// CHECK: Groups {
+// CHECK-NEXT: Group {
+// CHECK-NEXT: Name: .group
+// CHECK-NEXT: Index:
+// CHECK-NEXT: Type: COMDAT
+// CHECK-NEXT: Signature: 123
+// CHECK-NEXT: Section(s) in group [
+// CHECK-NEXT: .foo
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+// CHECK-NEXT: Group {
+// CHECK-NEXT: Name: .group
+// CHECK-NEXT: Index:
+// CHECK-NEXT: Type: COMDAT
+// CHECK-NEXT: Signature: abc
+// CHECK-NEXT: Section(s) in group [
+// CHECK-NEXT: .bar
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+// CHECK-NEXT: }